fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Functions and definitions to determine:  (1) whether tininess for underflow
 100 | is detected before or after rounding by default, (2) what (if anything)
 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 | are propagated from function inputs to output.  These details are target-
 104 | specific.
 105 *----------------------------------------------------------------------------*/
 106 #include "softfloat-specialize.h"
 107
 108 /*----------------------------------------------------------------------------
 109 | Returns the fraction bits of the half-precision floating-point value `a'.
 110 *----------------------------------------------------------------------------*/
 111
 112 static inline uint32_t extractFloat16Frac(float16 a)
 113 {
 114     return float16_val(a) & 0x3ff;
 115 }
 116
 117 /*----------------------------------------------------------------------------
 118 | Returns the exponent bits of the half-precision floating-point value `a'.
 119 *----------------------------------------------------------------------------*/
 120
 121 static inline int extractFloat16Exp(float16 a)
 122 {
 123     return (float16_val(a) >> 10) & 0x1f;
 124 }
 125
 126 /*----------------------------------------------------------------------------
 127 | Returns the sign bit of the single-precision floating-point value `a'.
 128 *----------------------------------------------------------------------------*/
 129
 130 static inline flag extractFloat16Sign(float16 a)
 131 {
 132     return float16_val(a)>>15;
 133 }
 134
 135 /*----------------------------------------------------------------------------
 136 | Returns the fraction bits of the single-precision floating-point value `a'.
 137 *----------------------------------------------------------------------------*/
 138
 139 static inline uint32_t extractFloat32Frac(float32 a)
 140 {
 141     return float32_val(a) & 0x007FFFFF;
 142 }
 143
 144 /*----------------------------------------------------------------------------
 145 | Returns the exponent bits of the single-precision floating-point value `a'.
 146 *----------------------------------------------------------------------------*/
 147
 148 static inline int extractFloat32Exp(float32 a)
 149 {
 150     return (float32_val(a) >> 23) & 0xFF;
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Returns the sign bit of the single-precision floating-point value `a'.
 155 *----------------------------------------------------------------------------*/
 156
 157 static inline flag extractFloat32Sign(float32 a)
 158 {
 159     return float32_val(a) >> 31;
 160 }
 161
 162 /*----------------------------------------------------------------------------
 163 | Returns the fraction bits of the double-precision floating-point value `a'.
 164 *----------------------------------------------------------------------------*/
 165
 166 static inline uint64_t extractFloat64Frac(float64 a)
 167 {
 168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the exponent bits of the double-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 static inline int extractFloat64Exp(float64 a)
 176 {
 177     return (float64_val(a) >> 52) & 0x7FF;
 178 }
 179
 180 /*----------------------------------------------------------------------------
 181 | Returns the sign bit of the double-precision floating-point value `a'.
 182 *----------------------------------------------------------------------------*/
 183
 184 static inline flag extractFloat64Sign(float64 a)
 185 {
 186     return float64_val(a) >> 63;
 187 }
 188
 189 /*
 190  * Classify a floating point number. Everything above float_class_qnan
 191  * is a NaN so cls >= float_class_qnan is any NaN.
 192  */
 193
 194 typedef enum __attribute__ ((__packed__)) {
 195     float_class_unclassified,
 196     float_class_zero,
 197     float_class_normal,
 198     float_class_inf,
 199     float_class_qnan,  /* all NaNs from here */
 200     float_class_snan,
 201     float_class_dnan,
 202     float_class_msnan, /* maybe silenced */
 203 } FloatClass;
 204
 205 /*
 206  * Structure holding all of the decomposed parts of a float. The
 207  * exponent is unbiased and the fraction is normalized. All
 208  * calculations are done with a 64 bit fraction and then rounded as
 209  * appropriate for the final format.
 210  *
 211  * Thanks to the packed FloatClass a decent compiler should be able to
 212  * fit the whole structure into registers and avoid using the stack
 213  * for parameter passing.
 214  */
 215
 216 typedef struct {
 217     uint64_t frac;
 218     int32_t  exp;
 219     FloatClass cls;
 220     bool sign;
 221 } FloatParts;
 222
 223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 226
 227 /* Structure holding all of the relevant parameters for a format.
 228  *   exp_size: the size of the exponent field
 229  *   exp_bias: the offset applied to the exponent field
 230  *   exp_max: the maximum normalised exponent
 231  *   frac_size: the size of the fraction field
 232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 233  * The following are computed based the size of fraction
 234  *   frac_lsb: least significant bit of fraction
 235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 236  *   round_mask/roundeven_mask: masks used for rounding
 237  */
 238 typedef struct {
 239     int exp_size;
 240     int exp_bias;
 241     int exp_max;
 242     int frac_size;
 243     int frac_shift;
 244     uint64_t frac_lsb;
 245     uint64_t frac_lsbm1;
 246     uint64_t round_mask;
 247     uint64_t roundeven_mask;
 248 } FloatFmt;
 249
 250 /* Expand fields based on the size of exponent and fraction */
 251 #define FLOAT_PARAMS(E, F)                                           \
 252     .exp_size       = E,                                             \
 253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 254     .exp_max        = (1 << E) - 1,                                  \
 255     .frac_size      = F,                                             \
 256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 261
 262 static const FloatFmt float16_params = {
 263     FLOAT_PARAMS(5, 10)
 264 };
 265
 266 static const FloatFmt float32_params = {
 267     FLOAT_PARAMS(8, 23)
 268 };
 269
 270 static const FloatFmt float64_params = {
 271     FLOAT_PARAMS(11, 52)
 272 };
 273
 274 /* Unpack a float to parts, but do not canonicalize.  */
 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 276 {
 277     const int sign_pos = fmt.frac_size + fmt.exp_size;
 278
 279     return (FloatParts) {
 280         .cls = float_class_unclassified,
 281         .sign = extract64(raw, sign_pos, 1),
 282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 283         .frac = extract64(raw, 0, fmt.frac_size),
 284     };
 285 }
 286
 287 static inline FloatParts float16_unpack_raw(float16 f)
 288 {
 289     return unpack_raw(float16_params, f);
 290 }
 291
 292 static inline FloatParts float32_unpack_raw(float32 f)
 293 {
 294     return unpack_raw(float32_params, f);
 295 }
 296
 297 static inline FloatParts float64_unpack_raw(float64 f)
 298 {
 299     return unpack_raw(float64_params, f);
 300 }
 301
 302 /* Pack a float from parts, but do not canonicalize.  */
 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 304 {
 305     const int sign_pos = fmt.frac_size + fmt.exp_size;
 306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 307     return deposit64(ret, sign_pos, 1, p.sign);
 308 }
 309
 310 static inline float16 float16_pack_raw(FloatParts p)
 311 {
 312     return make_float16(pack_raw(float16_params, p));
 313 }
 314
 315 static inline float32 float32_pack_raw(FloatParts p)
 316 {
 317     return make_float32(pack_raw(float32_params, p));
 318 }
 319
 320 static inline float64 float64_pack_raw(FloatParts p)
 321 {
 322     return make_float64(pack_raw(float64_params, p));
 323 }
 324
 325 /* Canonicalize EXP and FRAC, setting CLS.  */
 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 327                                float_status *status)
 328 {
 329     if (part.exp == parm->exp_max) {
 330         if (part.frac == 0) {
 331             part.cls = float_class_inf;
 332         } else {
 333 #ifdef NO_SIGNALING_NANS
 334             part.cls = float_class_qnan;
 335 #else
 336             int64_t msb = part.frac << (parm->frac_shift + 2);
 337             if ((msb < 0) == status->snan_bit_is_one) {
 338                 part.cls = float_class_snan;
 339             } else {
 340                 part.cls = float_class_qnan;
 341             }
 342 #endif
 343         }
 344     } else if (part.exp == 0) {
 345         if (likely(part.frac == 0)) {
 346             part.cls = float_class_zero;
 347         } else if (status->flush_inputs_to_zero) {
 348             float_raise(float_flag_input_denormal, status);
 349             part.cls = float_class_zero;
 350             part.frac = 0;
 351         } else {
 352             int shift = clz64(part.frac) - 1;
 353             part.cls = float_class_normal;
 354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 355             part.frac <<= shift;
 356         }
 357     } else {
 358         part.cls = float_class_normal;
 359         part.exp -= parm->exp_bias;
 360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 361     }
 362     return part;
 363 }
 364
 365 /* Round and uncanonicalize a floating-point number by parts. There
 366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 367  * fraction; these bits will be removed. The exponent will be biased
 368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 369  */
 370
 371 static FloatParts round_canonical(FloatParts p, float_status *s,
 372                                   const FloatFmt *parm)
 373 {
 374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 375     const uint64_t round_mask = parm->round_mask;
 376     const uint64_t roundeven_mask = parm->roundeven_mask;
 377     const int exp_max = parm->exp_max;
 378     const int frac_shift = parm->frac_shift;
 379     uint64_t frac, inc;
 380     int exp, flags = 0;
 381     bool overflow_norm;
 382
 383     frac = p.frac;
 384     exp = p.exp;
 385
 386     switch (p.cls) {
 387     case float_class_normal:
 388         switch (s->float_rounding_mode) {
 389         case float_round_nearest_even:
 390             overflow_norm = false;
 391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 392             break;
 393         case float_round_ties_away:
 394             overflow_norm = false;
 395             inc = frac_lsbm1;
 396             break;
 397         case float_round_to_zero:
 398             overflow_norm = true;
 399             inc = 0;
 400             break;
 401         case float_round_up:
 402             inc = p.sign ? 0 : round_mask;
 403             overflow_norm = p.sign;
 404             break;
 405         case float_round_down:
 406             inc = p.sign ? round_mask : 0;
 407             overflow_norm = !p.sign;
 408             break;
 409         default:
 410             g_assert_not_reached();
 411         }
 412
 413         exp += parm->exp_bias;
 414         if (likely(exp > 0)) {
 415             if (frac & round_mask) {
 416                 flags |= float_flag_inexact;
 417                 frac += inc;
 418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 419                     frac >>= 1;
 420                     exp++;
 421                 }
 422             }
 423             frac >>= frac_shift;
 424
 425             if (unlikely(exp >= exp_max)) {
 426                 flags |= float_flag_overflow | float_flag_inexact;
 427                 if (overflow_norm) {
 428                     exp = exp_max - 1;
 429                     frac = -1;
 430                 } else {
 431                     p.cls = float_class_inf;
 432                     goto do_inf;
 433                 }
 434             }
 435         } else if (s->flush_to_zero) {
 436             flags |= float_flag_output_denormal;
 437             p.cls = float_class_zero;
 438             goto do_zero;
 439         } else {
 440             bool is_tiny = (s->float_detect_tininess
 441                             == float_tininess_before_rounding)
 442                         || (exp < 0)
 443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 444
 445             shift64RightJamming(frac, 1 - exp, &frac);
 446             if (frac & round_mask) {
 447                 /* Need to recompute round-to-even.  */
 448                 if (s->float_rounding_mode == float_round_nearest_even) {
 449                     inc = ((frac & roundeven_mask) != frac_lsbm1
 450                            ? frac_lsbm1 : 0);
 451                 }
 452                 flags |= float_flag_inexact;
 453                 frac += inc;
 454             }
 455
 456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 457             frac >>= frac_shift;
 458
 459             if (is_tiny && (flags & float_flag_inexact)) {
 460                 flags |= float_flag_underflow;
 461             }
 462             if (exp == 0 && frac == 0) {
 463                 p.cls = float_class_zero;
 464             }
 465         }
 466         break;
 467
 468     case float_class_zero:
 469     do_zero:
 470         exp = 0;
 471         frac = 0;
 472         break;
 473
 474     case float_class_inf:
 475     do_inf:
 476         exp = exp_max;
 477         frac = 0;
 478         break;
 479
 480     case float_class_qnan:
 481     case float_class_snan:
 482         exp = exp_max;
 483         break;
 484
 485     default:
 486         g_assert_not_reached();
 487     }
 488
 489     float_raise(flags, s);
 490     p.exp = exp;
 491     p.frac = frac;
 492     return p;
 493 }
 494
 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 496 {
 497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
 498 }
 499
 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 501 {
 502     switch (p.cls) {
 503     case float_class_dnan:
 504         return float16_default_nan(s);
 505     case float_class_msnan:
 506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
 507     default:
 508         p = round_canonical(p, s, &float16_params);
 509         return float16_pack_raw(p);
 510     }
 511 }
 512
 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 514 {
 515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 516 }
 517
 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 519 {
 520     switch (p.cls) {
 521     case float_class_dnan:
 522         return float32_default_nan(s);
 523     case float_class_msnan:
 524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
 525     default:
 526         p = round_canonical(p, s, &float32_params);
 527         return float32_pack_raw(p);
 528     }
 529 }
 530
 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 532 {
 533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 534 }
 535
 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 537 {
 538     switch (p.cls) {
 539     case float_class_dnan:
 540         return float64_default_nan(s);
 541     case float_class_msnan:
 542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
 543     default:
 544         p = round_canonical(p, s, &float64_params);
 545         return float64_pack_raw(p);
 546     }
 547 }
 548
 549 /* Simple helpers for checking if what NaN we have */
 550 static bool is_nan(FloatClass c)
 551 {
 552     return unlikely(c >= float_class_qnan);
 553 }
 554 static bool is_snan(FloatClass c)
 555 {
 556     return c == float_class_snan;
 557 }
 558 static bool is_qnan(FloatClass c)
 559 {
 560     return c == float_class_qnan;
 561 }
 562
 563 static FloatParts return_nan(FloatParts a, float_status *s)
 564 {
 565     switch (a.cls) {
 566     case float_class_snan:
 567         s->float_exception_flags |= float_flag_invalid;
 568         a.cls = float_class_msnan;
 569         /* fall through */
 570     case float_class_qnan:
 571         if (s->default_nan_mode) {
 572             a.cls = float_class_dnan;
 573         }
 574         break;
 575
 576     default:
 577         g_assert_not_reached();
 578     }
 579     return a;
 580 }
 581
 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 583 {
 584     if (is_snan(a.cls) || is_snan(b.cls)) {
 585         s->float_exception_flags |= float_flag_invalid;
 586     }
 587
 588     if (s->default_nan_mode) {
 589         a.cls = float_class_dnan;
 590     } else {
 591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 592                     is_qnan(b.cls), is_snan(b.cls),
 593                     a.frac > b.frac ||
 594                     (a.frac == b.frac && a.sign < b.sign))) {
 595             a = b;
 596         }
 597         a.cls = float_class_msnan;
 598     }
 599     return a;
 600 }
 601
 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 603                                   bool inf_zero, float_status *s)
 604 {
 605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 606         s->float_exception_flags |= float_flag_invalid;
 607     }
 608
 609     if (s->default_nan_mode) {
 610         a.cls = float_class_dnan;
 611     } else {
 612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 613                               is_qnan(b.cls), is_snan(b.cls),
 614                               is_qnan(c.cls), is_snan(c.cls),
 615                               inf_zero, s)) {
 616         case 0:
 617             break;
 618         case 1:
 619             a = b;
 620             break;
 621         case 2:
 622             a = c;
 623             break;
 624         case 3:
 625             a.cls = float_class_dnan;
 626             return a;
 627         default:
 628             g_assert_not_reached();
 629         }
 630
 631         a.cls = float_class_msnan;
 632     }
 633     return a;
 634 }
 635
 636 /*
 637  * Returns the result of adding or subtracting the values of the
 638  * floating-point values `a' and `b'. The operation is performed
 639  * according to the IEC/IEEE Standard for Binary Floating-Point
 640  * Arithmetic.
 641  */
 642
 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 644                                 float_status *s)
 645 {
 646     bool a_sign = a.sign;
 647     bool b_sign = b.sign ^ subtract;
 648
 649     if (a_sign != b_sign) {
 650         /* Subtraction */
 651
 652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 655                 a.frac = a.frac - b.frac;
 656             } else {
 657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 658                 a.frac = b.frac - a.frac;
 659                 a.exp = b.exp;
 660                 a_sign ^= 1;
 661             }
 662
 663             if (a.frac == 0) {
 664                 a.cls = float_class_zero;
 665                 a.sign = s->float_rounding_mode == float_round_down;
 666             } else {
 667                 int shift = clz64(a.frac) - 1;
 668                 a.frac = a.frac << shift;
 669                 a.exp = a.exp - shift;
 670                 a.sign = a_sign;
 671             }
 672             return a;
 673         }
 674         if (is_nan(a.cls) || is_nan(b.cls)) {
 675             return pick_nan(a, b, s);
 676         }
 677         if (a.cls == float_class_inf) {
 678             if (b.cls == float_class_inf) {
 679                 float_raise(float_flag_invalid, s);
 680                 a.cls = float_class_dnan;
 681             }
 682             return a;
 683         }
 684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 685             a.sign = s->float_rounding_mode == float_round_down;
 686             return a;
 687         }
 688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 689             b.sign = a_sign ^ 1;
 690             return b;
 691         }
 692         if (b.cls == float_class_zero) {
 693             return a;
 694         }
 695     } else {
 696         /* Addition */
 697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 698             if (a.exp > b.exp) {
 699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 700             } else if (a.exp < b.exp) {
 701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 702                 a.exp = b.exp;
 703             }
 704             a.frac += b.frac;
 705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 706                 a.frac >>= 1;
 707                 a.exp += 1;
 708             }
 709             return a;
 710         }
 711         if (is_nan(a.cls) || is_nan(b.cls)) {
 712             return pick_nan(a, b, s);
 713         }
 714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 715             return a;
 716         }
 717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 718             b.sign = b_sign;
 719             return b;
 720         }
 721     }
 722     g_assert_not_reached();
 723 }
 724
 725 /*
 726  * Returns the result of adding or subtracting the floating-point
 727  * values `a' and `b'. The operation is performed according to the
 728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 729  */
 730
 731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 732                                               float_status *status)
 733 {
 734     FloatParts pa = float16_unpack_canonical(a, status);
 735     FloatParts pb = float16_unpack_canonical(b, status);
 736     FloatParts pr = addsub_floats(pa, pb, false, status);
 737
 738     return float16_round_pack_canonical(pr, status);
 739 }
 740
 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 742                                              float_status *status)
 743 {
 744     FloatParts pa = float32_unpack_canonical(a, status);
 745     FloatParts pb = float32_unpack_canonical(b, status);
 746     FloatParts pr = addsub_floats(pa, pb, false, status);
 747
 748     return float32_round_pack_canonical(pr, status);
 749 }
 750
 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 752                                              float_status *status)
 753 {
 754     FloatParts pa = float64_unpack_canonical(a, status);
 755     FloatParts pb = float64_unpack_canonical(b, status);
 756     FloatParts pr = addsub_floats(pa, pb, false, status);
 757
 758     return float64_round_pack_canonical(pr, status);
 759 }
 760
 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 762                                              float_status *status)
 763 {
 764     FloatParts pa = float16_unpack_canonical(a, status);
 765     FloatParts pb = float16_unpack_canonical(b, status);
 766     FloatParts pr = addsub_floats(pa, pb, true, status);
 767
 768     return float16_round_pack_canonical(pr, status);
 769 }
 770
 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 772                                              float_status *status)
 773 {
 774     FloatParts pa = float32_unpack_canonical(a, status);
 775     FloatParts pb = float32_unpack_canonical(b, status);
 776     FloatParts pr = addsub_floats(pa, pb, true, status);
 777
 778     return float32_round_pack_canonical(pr, status);
 779 }
 780
 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 782                                              float_status *status)
 783 {
 784     FloatParts pa = float64_unpack_canonical(a, status);
 785     FloatParts pb = float64_unpack_canonical(b, status);
 786     FloatParts pr = addsub_floats(pa, pb, true, status);
 787
 788     return float64_round_pack_canonical(pr, status);
 789 }
 790
 791 /*
 792  * Returns the result of multiplying the floating-point values `a' and
 793  * `b'. The operation is performed according to the IEC/IEEE Standard
 794  * for Binary Floating-Point Arithmetic.
 795  */
 796
 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 798 {
 799     bool sign = a.sign ^ b.sign;
 800
 801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 802         uint64_t hi, lo;
 803         int exp = a.exp + b.exp;
 804
 805         mul64To128(a.frac, b.frac, &hi, &lo);
 806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 808             shift64RightJamming(lo, 1, &lo);
 809             exp += 1;
 810         }
 811
 812         /* Re-use a */
 813         a.exp = exp;
 814         a.sign = sign;
 815         a.frac = lo;
 816         return a;
 817     }
 818     /* handle all the NaN cases */
 819     if (is_nan(a.cls) || is_nan(b.cls)) {
 820         return pick_nan(a, b, s);
 821     }
 822     /* Inf * Zero == NaN */
 823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 825         s->float_exception_flags |= float_flag_invalid;
 826         a.cls = float_class_dnan;
 827         a.sign = sign;
 828         return a;
 829     }
 830     /* Multiply by 0 or Inf */
 831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 832         a.sign = sign;
 833         return a;
 834     }
 835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 836         b.sign = sign;
 837         return b;
 838     }
 839     g_assert_not_reached();
 840 }
 841
 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 843                                              float_status *status)
 844 {
 845     FloatParts pa = float16_unpack_canonical(a, status);
 846     FloatParts pb = float16_unpack_canonical(b, status);
 847     FloatParts pr = mul_floats(pa, pb, status);
 848
 849     return float16_round_pack_canonical(pr, status);
 850 }
 851
 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 853                                              float_status *status)
 854 {
 855     FloatParts pa = float32_unpack_canonical(a, status);
 856     FloatParts pb = float32_unpack_canonical(b, status);
 857     FloatParts pr = mul_floats(pa, pb, status);
 858
 859     return float32_round_pack_canonical(pr, status);
 860 }
 861
 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 863                                              float_status *status)
 864 {
 865     FloatParts pa = float64_unpack_canonical(a, status);
 866     FloatParts pb = float64_unpack_canonical(b, status);
 867     FloatParts pr = mul_floats(pa, pb, status);
 868
 869     return float64_round_pack_canonical(pr, status);
 870 }
 871
 872 /*
 873  * Returns the result of multiplying the floating-point values `a' and
 874  * `b' then adding 'c', with no intermediate rounding step after the
 875  * multiplication. The operation is performed according to the
 876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 877  * The flags argument allows the caller to select negation of the
 878  * addend, the intermediate product, or the final result. (The
 879  * difference between this and having the caller do a separate
 880  * negation is that negating externally will flip the sign bit on
 881  * NaNs.)
 882  */
 883
 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 885                                 int flags, float_status *s)
 886 {
 887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 888                     ((1 << float_class_inf) | (1 << float_class_zero));
 889     bool p_sign;
 890     bool sign_flip = flags & float_muladd_negate_result;
 891     FloatClass p_class;
 892     uint64_t hi, lo;
 893     int p_exp;
 894
 895     /* It is implementation-defined whether the cases of (0,inf,qnan)
 896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 897      * they return if they do), so we have to hand this information
 898      * off to the target-specific pick-a-NaN routine.
 899      */
 900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 901         return pick_nan_muladd(a, b, c, inf_zero, s);
 902     }
 903
 904     if (inf_zero) {
 905         s->float_exception_flags |= float_flag_invalid;
 906         a.cls = float_class_dnan;
 907         return a;
 908     }
 909
 910     if (flags & float_muladd_negate_c) {
 911         c.sign ^= 1;
 912     }
 913
 914     p_sign = a.sign ^ b.sign;
 915
 916     if (flags & float_muladd_negate_product) {
 917         p_sign ^= 1;
 918     }
 919
 920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 921         p_class = float_class_inf;
 922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 923         p_class = float_class_zero;
 924     } else {
 925         p_class = float_class_normal;
 926     }
 927
 928     if (c.cls == float_class_inf) {
 929         if (p_class == float_class_inf && p_sign != c.sign) {
 930             s->float_exception_flags |= float_flag_invalid;
 931             a.cls = float_class_dnan;
 932         } else {
 933             a.cls = float_class_inf;
 934             a.sign = c.sign ^ sign_flip;
 935         }
 936         return a;
 937     }
 938
 939     if (p_class == float_class_inf) {
 940         a.cls = float_class_inf;
 941         a.sign = p_sign ^ sign_flip;
 942         return a;
 943     }
 944
 945     if (p_class == float_class_zero) {
 946         if (c.cls == float_class_zero) {
 947             if (p_sign != c.sign) {
 948                 p_sign = s->float_rounding_mode == float_round_down;
 949             }
 950             c.sign = p_sign;
 951         } else if (flags & float_muladd_halve_result) {
 952             c.exp -= 1;
 953         }
 954         c.sign ^= sign_flip;
 955         return c;
 956     }
 957
 958     /* a & b should be normals now... */
 959     assert(a.cls == float_class_normal &&
 960            b.cls == float_class_normal);
 961
 962     p_exp = a.exp + b.exp;
 963
 964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 965      * result.
 966      */
 967     mul64To128(a.frac, b.frac, &hi, &lo);
 968     /* binary point now at bit 124 */
 969
 970     /* check for overflow */
 971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 972         shift128RightJamming(hi, lo, 1, &hi, &lo);
 973         p_exp += 1;
 974     }
 975
 976     /* + add/sub */
 977     if (c.cls == float_class_zero) {
 978         /* move binary point back to 62 */
 979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 980     } else {
 981         int exp_diff = p_exp - c.exp;
 982         if (p_sign == c.sign) {
 983             /* Addition */
 984             if (exp_diff <= 0) {
 985                 shift128RightJamming(hi, lo,
 986                                      DECOMPOSED_BINARY_POINT - exp_diff,
 987                                      &hi, &lo);
 988                 lo += c.frac;
 989                 p_exp = c.exp;
 990             } else {
 991                 uint64_t c_hi, c_lo;
 992                 /* shift c to the same binary point as the product (124) */
 993                 c_hi = c.frac >> 2;
 994                 c_lo = 0;
 995                 shift128RightJamming(c_hi, c_lo,
 996                                      exp_diff,
 997                                      &c_hi, &c_lo);
 998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072
1073     return a;
1074 }
1075
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345     case float_class_dnan:
1346     case float_class_msnan:
1347         return max;
1348     case float_class_inf:
1349         return p.sign ? min : max;
1350     case float_class_zero:
1351         return 0;
1352     case float_class_normal:
1353         if (p.exp < DECOMPOSED_BINARY_POINT) {
1354             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1355         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1356             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1357         } else {
1358             r = UINT64_MAX;
1359         }
1360         if (p.sign) {
1361             if (r < -(uint64_t) min) {
1362                 return -r;
1363             } else {
1364                 s->float_exception_flags = orig_flags | float_flag_invalid;
1365                 return min;
1366             }
1367         } else {
1368             if (r < max) {
1369                 return r;
1370             } else {
1371                 s->float_exception_flags = orig_flags | float_flag_invalid;
1372                 return max;
1373             }
1374         }
1375     default:
1376         g_assert_not_reached();
1377     }
1378 }
1379
1380 #define FLOAT_TO_INT(fsz, isz)                                          \
1381 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1382                                                 float_status *s)        \
1383 {                                                                       \
1384     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1385     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1386                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1387                                  s);                                    \
1388 }                                                                       \
1389                                                                         \
1390 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1391  (float ## fsz a, float_status *s)                                      \
1392 {                                                                       \
1393     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1394     return round_to_int_and_pack(p, float_round_to_zero,                \
1395                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1396                                  s);                                    \
1397 }
1398
1399 FLOAT_TO_INT(16, 16)
1400 FLOAT_TO_INT(16, 32)
1401 FLOAT_TO_INT(16, 64)
1402
1403 FLOAT_TO_INT(32, 16)
1404 FLOAT_TO_INT(32, 32)
1405 FLOAT_TO_INT(32, 64)
1406
1407 FLOAT_TO_INT(64, 16)
1408 FLOAT_TO_INT(64, 32)
1409 FLOAT_TO_INT(64, 64)
1410
1411 #undef FLOAT_TO_INT
1412
1413 /*
1414  *  Returns the result of converting the floating-point value `a' to
1415  *  the unsigned integer format. The conversion is performed according
1416  *  to the IEC/IEEE Standard for Binary Floating-Point
1417  *  Arithmetic---which means in particular that the conversion is
1418  *  rounded according to the current rounding mode. If `a' is a NaN,
1419  *  the largest unsigned integer is returned. Otherwise, if the
1420  *  conversion overflows, the largest unsigned integer is returned. If
1421  *  the 'a' is negative, the result is rounded and zero is returned;
1422  *  values that do not round to zero will raise the inexact exception
1423  *  flag.
1424  */
1425
1426 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1427                                        float_status *s)
1428 {
1429     int orig_flags = get_float_exception_flags(s);
1430     FloatParts p = round_to_int(in, rmode, s);
1431
1432     switch (p.cls) {
1433     case float_class_snan:
1434     case float_class_qnan:
1435     case float_class_dnan:
1436     case float_class_msnan:
1437         s->float_exception_flags = orig_flags | float_flag_invalid;
1438         return max;
1439     case float_class_inf:
1440         return p.sign ? 0 : max;
1441     case float_class_zero:
1442         return 0;
1443     case float_class_normal:
1444     {
1445         uint64_t r;
1446         if (p.sign) {
1447             s->float_exception_flags = orig_flags | float_flag_invalid;
1448             return 0;
1449         }
1450
1451         if (p.exp < DECOMPOSED_BINARY_POINT) {
1452             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1453         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1454             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1455         } else {
1456             s->float_exception_flags = orig_flags | float_flag_invalid;
1457             return max;
1458         }
1459
1460         /* For uint64 this will never trip, but if p.exp is too large
1461          * to shift a decomposed fraction we shall have exited via the
1462          * 3rd leg above.
1463          */
1464         if (r > max) {
1465             s->float_exception_flags = orig_flags | float_flag_invalid;
1466             return max;
1467         } else {
1468             return r;
1469         }
1470     }
1471     default:
1472         g_assert_not_reached();
1473     }
1474 }
1475
1476 #define FLOAT_TO_UINT(fsz, isz) \
1477 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1478                                                   float_status *s)      \
1479 {                                                                       \
1480     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1481     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1482                                  UINT ## isz ## _MAX, s);               \
1483 }                                                                       \
1484                                                                         \
1485 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1486  (float ## fsz a, float_status *s)                                      \
1487 {                                                                       \
1488     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1489     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1490                                  UINT ## isz ## _MAX, s);               \
1491 }
1492
1493 FLOAT_TO_UINT(16, 16)
1494 FLOAT_TO_UINT(16, 32)
1495 FLOAT_TO_UINT(16, 64)
1496
1497 FLOAT_TO_UINT(32, 16)
1498 FLOAT_TO_UINT(32, 32)
1499 FLOAT_TO_UINT(32, 64)
1500
1501 FLOAT_TO_UINT(64, 16)
1502 FLOAT_TO_UINT(64, 32)
1503 FLOAT_TO_UINT(64, 64)
1504
1505 #undef FLOAT_TO_UINT
1506
1507 /*
1508  * Integer to float conversions
1509  *
1510  * Returns the result of converting the two's complement integer `a'
1511  * to the floating-point format. The conversion is performed according
1512  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1513  */
1514
1515 static FloatParts int_to_float(int64_t a, float_status *status)
1516 {
1517     FloatParts r;
1518     if (a == 0) {
1519         r.cls = float_class_zero;
1520         r.sign = false;
1521     } else if (a == (1ULL << 63)) {
1522         r.cls = float_class_normal;
1523         r.sign = true;
1524         r.frac = DECOMPOSED_IMPLICIT_BIT;
1525         r.exp = 63;
1526     } else {
1527         uint64_t f;
1528         if (a < 0) {
1529             f = -a;
1530             r.sign = true;
1531         } else {
1532             f = a;
1533             r.sign = false;
1534         }
1535         int shift = clz64(f) - 1;
1536         r.cls = float_class_normal;
1537         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1538         r.frac = f << shift;
1539     }
1540
1541     return r;
1542 }
1543
1544 float16 int64_to_float16(int64_t a, float_status *status)
1545 {
1546     FloatParts pa = int_to_float(a, status);
1547     return float16_round_pack_canonical(pa, status);
1548 }
1549
1550 float16 int32_to_float16(int32_t a, float_status *status)
1551 {
1552     return int64_to_float16(a, status);
1553 }
1554
1555 float16 int16_to_float16(int16_t a, float_status *status)
1556 {
1557     return int64_to_float16(a, status);
1558 }
1559
1560 float32 int64_to_float32(int64_t a, float_status *status)
1561 {
1562     FloatParts pa = int_to_float(a, status);
1563     return float32_round_pack_canonical(pa, status);
1564 }
1565
1566 float32 int32_to_float32(int32_t a, float_status *status)
1567 {
1568     return int64_to_float32(a, status);
1569 }
1570
1571 float32 int16_to_float32(int16_t a, float_status *status)
1572 {
1573     return int64_to_float32(a, status);
1574 }
1575
1576 float64 int64_to_float64(int64_t a, float_status *status)
1577 {
1578     FloatParts pa = int_to_float(a, status);
1579     return float64_round_pack_canonical(pa, status);
1580 }
1581
1582 float64 int32_to_float64(int32_t a, float_status *status)
1583 {
1584     return int64_to_float64(a, status);
1585 }
1586
1587 float64 int16_to_float64(int16_t a, float_status *status)
1588 {
1589     return int64_to_float64(a, status);
1590 }
1591
1592
1593 /*
1594  * Unsigned Integer to float conversions
1595  *
1596  * Returns the result of converting the unsigned integer `a' to the
1597  * floating-point format. The conversion is performed according to the
1598  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1599  */
1600
1601 static FloatParts uint_to_float(uint64_t a, float_status *status)
1602 {
1603     FloatParts r = { .sign = false};
1604
1605     if (a == 0) {
1606         r.cls = float_class_zero;
1607     } else {
1608         int spare_bits = clz64(a) - 1;
1609         r.cls = float_class_normal;
1610         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1611         if (spare_bits < 0) {
1612             shift64RightJamming(a, -spare_bits, &a);
1613             r.frac = a;
1614         } else {
1615             r.frac = a << spare_bits;
1616         }
1617     }
1618
1619     return r;
1620 }
1621
1622 float16 uint64_to_float16(uint64_t a, float_status *status)
1623 {
1624     FloatParts pa = uint_to_float(a, status);
1625     return float16_round_pack_canonical(pa, status);
1626 }
1627
1628 float16 uint32_to_float16(uint32_t a, float_status *status)
1629 {
1630     return uint64_to_float16(a, status);
1631 }
1632
1633 float16 uint16_to_float16(uint16_t a, float_status *status)
1634 {
1635     return uint64_to_float16(a, status);
1636 }
1637
1638 float32 uint64_to_float32(uint64_t a, float_status *status)
1639 {
1640     FloatParts pa = uint_to_float(a, status);
1641     return float32_round_pack_canonical(pa, status);
1642 }
1643
1644 float32 uint32_to_float32(uint32_t a, float_status *status)
1645 {
1646     return uint64_to_float32(a, status);
1647 }
1648
1649 float32 uint16_to_float32(uint16_t a, float_status *status)
1650 {
1651     return uint64_to_float32(a, status);
1652 }
1653
1654 float64 uint64_to_float64(uint64_t a, float_status *status)
1655 {
1656     FloatParts pa = uint_to_float(a, status);
1657     return float64_round_pack_canonical(pa, status);
1658 }
1659
1660 float64 uint32_to_float64(uint32_t a, float_status *status)
1661 {
1662     return uint64_to_float64(a, status);
1663 }
1664
1665 float64 uint16_to_float64(uint16_t a, float_status *status)
1666 {
1667     return uint64_to_float64(a, status);
1668 }
1669
1670 /* Float Min/Max */
1671 /* min() and max() functions. These can't be implemented as
1672  * 'compare and pick one input' because that would mishandle
1673  * NaNs and +0 vs -0.
1674  *
1675  * minnum() and maxnum() functions. These are similar to the min()
1676  * and max() functions but if one of the arguments is a QNaN and
1677  * the other is numerical then the numerical argument is returned.
1678  * SNaNs will get quietened before being returned.
1679  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1680  * and maxNum() operations. min() and max() are the typical min/max
1681  * semantics provided by many CPUs which predate that specification.
1682  *
1683  * minnummag() and maxnummag() functions correspond to minNumMag()
1684  * and minNumMag() from the IEEE-754 2008.
1685  */
1686 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1687                                 bool ieee, bool ismag, float_status *s)
1688 {
1689     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1690         if (ieee) {
1691             /* Takes two floating-point values `a' and `b', one of
1692              * which is a NaN, and returns the appropriate NaN
1693              * result. If either `a' or `b' is a signaling NaN,
1694              * the invalid exception is raised.
1695              */
1696             if (is_snan(a.cls) || is_snan(b.cls)) {
1697                 return pick_nan(a, b, s);
1698             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1699                 return b;
1700             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1701                 return a;
1702             }
1703         }
1704         return pick_nan(a, b, s);
1705     } else {
1706         int a_exp, b_exp;
1707         bool a_sign, b_sign;
1708
1709         switch (a.cls) {
1710         case float_class_normal:
1711             a_exp = a.exp;
1712             break;
1713         case float_class_inf:
1714             a_exp = INT_MAX;
1715             break;
1716         case float_class_zero:
1717             a_exp = INT_MIN;
1718             break;
1719         default:
1720             g_assert_not_reached();
1721             break;
1722         }
1723         switch (b.cls) {
1724         case float_class_normal:
1725             b_exp = b.exp;
1726             break;
1727         case float_class_inf:
1728             b_exp = INT_MAX;
1729             break;
1730         case float_class_zero:
1731             b_exp = INT_MIN;
1732             break;
1733         default:
1734             g_assert_not_reached();
1735             break;
1736         }
1737
1738         a_sign = a.sign;
1739         b_sign = b.sign;
1740         if (ismag) {
1741             a_sign = b_sign = 0;
1742         }
1743
1744         if (a_sign == b_sign) {
1745             bool a_less = a_exp < b_exp;
1746             if (a_exp == b_exp) {
1747                 a_less = a.frac < b.frac;
1748             }
1749             return a_sign ^ a_less ^ ismin ? b : a;
1750         } else {
1751             return a_sign ^ ismin ? b : a;
1752         }
1753     }
1754 }
1755
1756 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1757 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1758                                      float_status *s)                   \
1759 {                                                                       \
1760     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1761     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1762     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1763                                                                         \
1764     return float ## sz ## _round_pack_canonical(pr, s);                 \
1765 }
1766
1767 MINMAX(16, min, true, false, false)
1768 MINMAX(16, minnum, true, true, false)
1769 MINMAX(16, minnummag, true, true, true)
1770 MINMAX(16, max, false, false, false)
1771 MINMAX(16, maxnum, false, true, false)
1772 MINMAX(16, maxnummag, false, true, true)
1773
1774 MINMAX(32, min, true, false, false)
1775 MINMAX(32, minnum, true, true, false)
1776 MINMAX(32, minnummag, true, true, true)
1777 MINMAX(32, max, false, false, false)
1778 MINMAX(32, maxnum, false, true, false)
1779 MINMAX(32, maxnummag, false, true, true)
1780
1781 MINMAX(64, min, true, false, false)
1782 MINMAX(64, minnum, true, true, false)
1783 MINMAX(64, minnummag, true, true, true)
1784 MINMAX(64, max, false, false, false)
1785 MINMAX(64, maxnum, false, true, false)
1786 MINMAX(64, maxnummag, false, true, true)
1787
1788 #undef MINMAX
1789
1790 /* Floating point compare */
1791 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1792                           float_status *s)
1793 {
1794     if (is_nan(a.cls) || is_nan(b.cls)) {
1795         if (!is_quiet ||
1796             a.cls == float_class_snan ||
1797             b.cls == float_class_snan) {
1798             s->float_exception_flags |= float_flag_invalid;
1799         }
1800         return float_relation_unordered;
1801     }
1802
1803     if (a.cls == float_class_zero) {
1804         if (b.cls == float_class_zero) {
1805             return float_relation_equal;
1806         }
1807         return b.sign ? float_relation_greater : float_relation_less;
1808     } else if (b.cls == float_class_zero) {
1809         return a.sign ? float_relation_less : float_relation_greater;
1810     }
1811
1812     /* The only really important thing about infinity is its sign. If
1813      * both are infinities the sign marks the smallest of the two.
1814      */
1815     if (a.cls == float_class_inf) {
1816         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1817             return float_relation_equal;
1818         }
1819         return a.sign ? float_relation_less : float_relation_greater;
1820     } else if (b.cls == float_class_inf) {
1821         return b.sign ? float_relation_greater : float_relation_less;
1822     }
1823
1824     if (a.sign != b.sign) {
1825         return a.sign ? float_relation_less : float_relation_greater;
1826     }
1827
1828     if (a.exp == b.exp) {
1829         if (a.frac == b.frac) {
1830             return float_relation_equal;
1831         }
1832         if (a.sign) {
1833             return a.frac > b.frac ?
1834                 float_relation_less : float_relation_greater;
1835         } else {
1836             return a.frac > b.frac ?
1837                 float_relation_greater : float_relation_less;
1838         }
1839     } else {
1840         if (a.sign) {
1841             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1842         } else {
1843             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1844         }
1845     }
1846 }
1847
1848 #define COMPARE(sz)                                                     \
1849 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1850                             float_status *s)                            \
1851 {                                                                       \
1852     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1853     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1854     return compare_floats(pa, pb, false, s);                            \
1855 }                                                                       \
1856 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1857                                   float_status *s)                      \
1858 {                                                                       \
1859     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1860     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1861     return compare_floats(pa, pb, true, s);                             \
1862 }
1863
1864 COMPARE(16)
1865 COMPARE(32)
1866 COMPARE(64)
1867
1868 #undef COMPARE
1869
1870 /* Multiply A by 2 raised to the power N.  */
1871 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1872 {
1873     if (unlikely(is_nan(a.cls))) {
1874         return return_nan(a, s);
1875     }
1876     if (a.cls == float_class_normal) {
1877         a.exp += n;
1878     }
1879     return a;
1880 }
1881
1882 float16 float16_scalbn(float16 a, int n, float_status *status)
1883 {
1884     FloatParts pa = float16_unpack_canonical(a, status);
1885     FloatParts pr = scalbn_decomposed(pa, n, status);
1886     return float16_round_pack_canonical(pr, status);
1887 }
1888
1889 float32 float32_scalbn(float32 a, int n, float_status *status)
1890 {
1891     FloatParts pa = float32_unpack_canonical(a, status);
1892     FloatParts pr = scalbn_decomposed(pa, n, status);
1893     return float32_round_pack_canonical(pr, status);
1894 }
1895
1896 float64 float64_scalbn(float64 a, int n, float_status *status)
1897 {
1898     FloatParts pa = float64_unpack_canonical(a, status);
1899     FloatParts pr = scalbn_decomposed(pa, n, status);
1900     return float64_round_pack_canonical(pr, status);
1901 }
1902
1903 /*
1904  * Square Root
1905  *
1906  * The old softfloat code did an approximation step before zeroing in
1907  * on the final result. However for simpleness we just compute the
1908  * square root by iterating down from the implicit bit to enough extra
1909  * bits to ensure we get a correctly rounded result.
1910  *
1911  * This does mean however the calculation is slower than before,
1912  * especially for 64 bit floats.
1913  */
1914
1915 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1916 {
1917     uint64_t a_frac, r_frac, s_frac;
1918     int bit, last_bit;
1919
1920     if (is_nan(a.cls)) {
1921         return return_nan(a, s);
1922     }
1923     if (a.cls == float_class_zero) {
1924         return a;  /* sqrt(+-0) = +-0 */
1925     }
1926     if (a.sign) {
1927         s->float_exception_flags |= float_flag_invalid;
1928         a.cls = float_class_dnan;
1929         return a;
1930     }
1931     if (a.cls == float_class_inf) {
1932         return a;  /* sqrt(+inf) = +inf */
1933     }
1934
1935     assert(a.cls == float_class_normal);
1936
1937     /* We need two overflow bits at the top. Adding room for that is a
1938      * right shift. If the exponent is odd, we can discard the low bit
1939      * by multiplying the fraction by 2; that's a left shift. Combine
1940      * those and we shift right if the exponent is even.
1941      */
1942     a_frac = a.frac;
1943     if (!(a.exp & 1)) {
1944         a_frac >>= 1;
1945     }
1946     a.exp >>= 1;
1947
1948     /* Bit-by-bit computation of sqrt.  */
1949     r_frac = 0;
1950     s_frac = 0;
1951
1952     /* Iterate from implicit bit down to the 3 extra bits to compute a
1953      * properly rounded result. Remember we've inserted one more bit
1954      * at the top, so these positions are one less.
1955      */
1956     bit = DECOMPOSED_BINARY_POINT - 1;
1957     last_bit = MAX(p->frac_shift - 4, 0);
1958     do {
1959         uint64_t q = 1ULL << bit;
1960         uint64_t t_frac = s_frac + q;
1961         if (t_frac <= a_frac) {
1962             s_frac = t_frac + q;
1963             a_frac -= t_frac;
1964             r_frac += q;
1965         }
1966         a_frac <<= 1;
1967     } while (--bit >= last_bit);
1968
1969     /* Undo the right shift done above. If there is any remaining
1970      * fraction, the result is inexact. Set the sticky bit.
1971      */
1972     a.frac = (r_frac << 1) + (a_frac != 0);
1973
1974     return a;
1975 }
1976
1977 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1978 {
1979     FloatParts pa = float16_unpack_canonical(a, status);
1980     FloatParts pr = sqrt_float(pa, status, &float16_params);
1981     return float16_round_pack_canonical(pr, status);
1982 }
1983
1984 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1985 {
1986     FloatParts pa = float32_unpack_canonical(a, status);
1987     FloatParts pr = sqrt_float(pa, status, &float32_params);
1988     return float32_round_pack_canonical(pr, status);
1989 }
1990
1991 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1992 {
1993     FloatParts pa = float64_unpack_canonical(a, status);
1994     FloatParts pr = sqrt_float(pa, status, &float64_params);
1995     return float64_round_pack_canonical(pr, status);
1996 }
1997
1998
1999 /*----------------------------------------------------------------------------
2000 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2001 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2002 | input.  If `zSign' is 1, the input is negated before being converted to an
2003 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2004 | is simply rounded to an integer, with the inexact exception raised if the
2005 | input cannot be represented exactly as an integer.  However, if the fixed-
2006 | point input is too large, the invalid exception is raised and the largest
2007 | positive or negative integer is returned.
2008 *----------------------------------------------------------------------------*/
2009
2010 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2011 {
2012     int8_t roundingMode;
2013     flag roundNearestEven;
2014     int8_t roundIncrement, roundBits;
2015     int32_t z;
2016
2017     roundingMode = status->float_rounding_mode;
2018     roundNearestEven = ( roundingMode == float_round_nearest_even );
2019     switch (roundingMode) {
2020     case float_round_nearest_even:
2021     case float_round_ties_away:
2022         roundIncrement = 0x40;
2023         break;
2024     case float_round_to_zero:
2025         roundIncrement = 0;
2026         break;
2027     case float_round_up:
2028         roundIncrement = zSign ? 0 : 0x7f;
2029         break;
2030     case float_round_down:
2031         roundIncrement = zSign ? 0x7f : 0;
2032         break;
2033     default:
2034         abort();
2035     }
2036     roundBits = absZ & 0x7F;
2037     absZ = ( absZ + roundIncrement )>>7;
2038     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2039     z = absZ;
2040     if ( zSign ) z = - z;
2041     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2042         float_raise(float_flag_invalid, status);
2043         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2044     }
2045     if (roundBits) {
2046         status->float_exception_flags |= float_flag_inexact;
2047     }
2048     return z;
2049
2050 }
2051
2052 /*----------------------------------------------------------------------------
2053 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2054 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2055 | and returns the properly rounded 64-bit integer corresponding to the input.
2056 | If `zSign' is 1, the input is negated before being converted to an integer.
2057 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2058 | the inexact exception raised if the input cannot be represented exactly as
2059 | an integer.  However, if the fixed-point input is too large, the invalid
2060 | exception is raised and the largest positive or negative integer is
2061 | returned.
2062 *----------------------------------------------------------------------------*/
2063
2064 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2065                                float_status *status)
2066 {
2067     int8_t roundingMode;
2068     flag roundNearestEven, increment;
2069     int64_t z;
2070
2071     roundingMode = status->float_rounding_mode;
2072     roundNearestEven = ( roundingMode == float_round_nearest_even );
2073     switch (roundingMode) {
2074     case float_round_nearest_even:
2075     case float_round_ties_away:
2076         increment = ((int64_t) absZ1 < 0);
2077         break;
2078     case float_round_to_zero:
2079         increment = 0;
2080         break;
2081     case float_round_up:
2082         increment = !zSign && absZ1;
2083         break;
2084     case float_round_down:
2085         increment = zSign && absZ1;
2086         break;
2087     default:
2088         abort();
2089     }
2090     if ( increment ) {
2091         ++absZ0;
2092         if ( absZ0 == 0 ) goto overflow;
2093         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2094     }
2095     z = absZ0;
2096     if ( zSign ) z = - z;
2097     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2098  overflow:
2099         float_raise(float_flag_invalid, status);
2100         return
2101               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2102             : LIT64( 0x7FFFFFFFFFFFFFFF );
2103     }
2104     if (absZ1) {
2105         status->float_exception_flags |= float_flag_inexact;
2106     }
2107     return z;
2108
2109 }
2110
2111 /*----------------------------------------------------------------------------
2112 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2113 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2114 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2115 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2116 | with the inexact exception raised if the input cannot be represented exactly
2117 | as an integer.  However, if the fixed-point input is too large, the invalid
2118 | exception is raised and the largest unsigned integer is returned.
2119 *----------------------------------------------------------------------------*/
2120
2121 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2122                                 uint64_t absZ1, float_status *status)
2123 {
2124     int8_t roundingMode;
2125     flag roundNearestEven, increment;
2126
2127     roundingMode = status->float_rounding_mode;
2128     roundNearestEven = (roundingMode == float_round_nearest_even);
2129     switch (roundingMode) {
2130     case float_round_nearest_even:
2131     case float_round_ties_away:
2132         increment = ((int64_t)absZ1 < 0);
2133         break;
2134     case float_round_to_zero:
2135         increment = 0;
2136         break;
2137     case float_round_up:
2138         increment = !zSign && absZ1;
2139         break;
2140     case float_round_down:
2141         increment = zSign && absZ1;
2142         break;
2143     default:
2144         abort();
2145     }
2146     if (increment) {
2147         ++absZ0;
2148         if (absZ0 == 0) {
2149             float_raise(float_flag_invalid, status);
2150             return LIT64(0xFFFFFFFFFFFFFFFF);
2151         }
2152         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2153     }
2154
2155     if (zSign && absZ0) {
2156         float_raise(float_flag_invalid, status);
2157         return 0;
2158     }
2159
2160     if (absZ1) {
2161         status->float_exception_flags |= float_flag_inexact;
2162     }
2163     return absZ0;
2164 }
2165
2166 /*----------------------------------------------------------------------------
2167 | If `a' is denormal and we are in flush-to-zero mode then set the
2168 | input-denormal exception and return zero. Otherwise just return the value.
2169 *----------------------------------------------------------------------------*/
2170 float32 float32_squash_input_denormal(float32 a, float_status *status)
2171 {
2172     if (status->flush_inputs_to_zero) {
2173         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2174             float_raise(float_flag_input_denormal, status);
2175             return make_float32(float32_val(a) & 0x80000000);
2176         }
2177     }
2178     return a;
2179 }
2180
2181 /*----------------------------------------------------------------------------
2182 | Normalizes the subnormal single-precision floating-point value represented
2183 | by the denormalized significand `aSig'.  The normalized exponent and
2184 | significand are stored at the locations pointed to by `zExpPtr' and
2185 | `zSigPtr', respectively.
2186 *----------------------------------------------------------------------------*/
2187
2188 static void
2189  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2190 {
2191     int8_t shiftCount;
2192
2193     shiftCount = countLeadingZeros32( aSig ) - 8;
2194     *zSigPtr = aSig<<shiftCount;
2195     *zExpPtr = 1 - shiftCount;
2196
2197 }
2198
2199 /*----------------------------------------------------------------------------
2200 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2201 | and significand `zSig', and returns the proper single-precision floating-
2202 | point value corresponding to the abstract input.  Ordinarily, the abstract
2203 | value is simply rounded and packed into the single-precision format, with
2204 | the inexact exception raised if the abstract input cannot be represented
2205 | exactly.  However, if the abstract value is too large, the overflow and
2206 | inexact exceptions are raised and an infinity or maximal finite value is
2207 | returned.  If the abstract value is too small, the input value is rounded to
2208 | a subnormal number, and the underflow and inexact exceptions are raised if
2209 | the abstract input cannot be represented exactly as a subnormal single-
2210 | precision floating-point number.
2211 |     The input significand `zSig' has its binary point between bits 30
2212 | and 29, which is 7 bits to the left of the usual location.  This shifted
2213 | significand must be normalized or smaller.  If `zSig' is not normalized,
2214 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2215 | and it must not require rounding.  In the usual case that `zSig' is
2216 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2217 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2218 | Binary Floating-Point Arithmetic.
2219 *----------------------------------------------------------------------------*/
2220
2221 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2222                                    float_status *status)
2223 {
2224     int8_t roundingMode;
2225     flag roundNearestEven;
2226     int8_t roundIncrement, roundBits;
2227     flag isTiny;
2228
2229     roundingMode = status->float_rounding_mode;
2230     roundNearestEven = ( roundingMode == float_round_nearest_even );
2231     switch (roundingMode) {
2232     case float_round_nearest_even:
2233     case float_round_ties_away:
2234         roundIncrement = 0x40;
2235         break;
2236     case float_round_to_zero:
2237         roundIncrement = 0;
2238         break;
2239     case float_round_up:
2240         roundIncrement = zSign ? 0 : 0x7f;
2241         break;
2242     case float_round_down:
2243         roundIncrement = zSign ? 0x7f : 0;
2244         break;
2245     default:
2246         abort();
2247         break;
2248     }
2249     roundBits = zSig & 0x7F;
2250     if ( 0xFD <= (uint16_t) zExp ) {
2251         if (    ( 0xFD < zExp )
2252              || (    ( zExp == 0xFD )
2253                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2254            ) {
2255             float_raise(float_flag_overflow | float_flag_inexact, status);
2256             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2257         }
2258         if ( zExp < 0 ) {
2259             if (status->flush_to_zero) {
2260                 float_raise(float_flag_output_denormal, status);
2261                 return packFloat32(zSign, 0, 0);
2262             }
2263             isTiny =
2264                 (status->float_detect_tininess
2265                  == float_tininess_before_rounding)
2266                 || ( zExp < -1 )
2267                 || ( zSig + roundIncrement < 0x80000000 );
2268             shift32RightJamming( zSig, - zExp, &zSig );
2269             zExp = 0;
2270             roundBits = zSig & 0x7F;
2271             if (isTiny && roundBits) {
2272                 float_raise(float_flag_underflow, status);
2273             }
2274         }
2275     }
2276     if (roundBits) {
2277         status->float_exception_flags |= float_flag_inexact;
2278     }
2279     zSig = ( zSig + roundIncrement )>>7;
2280     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2281     if ( zSig == 0 ) zExp = 0;
2282     return packFloat32( zSign, zExp, zSig );
2283
2284 }
2285
2286 /*----------------------------------------------------------------------------
2287 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2288 | and significand `zSig', and returns the proper single-precision floating-
2289 | point value corresponding to the abstract input.  This routine is just like
2290 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2291 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2292 | floating-point exponent.
2293 *----------------------------------------------------------------------------*/
2294
2295 static float32
2296  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2297                               float_status *status)
2298 {
2299     int8_t shiftCount;
2300
2301     shiftCount = countLeadingZeros32( zSig ) - 1;
2302     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2303                                status);
2304
2305 }
2306
2307 /*----------------------------------------------------------------------------
2308 | If `a' is denormal and we are in flush-to-zero mode then set the
2309 | input-denormal exception and return zero. Otherwise just return the value.
2310 *----------------------------------------------------------------------------*/
2311 float64 float64_squash_input_denormal(float64 a, float_status *status)
2312 {
2313     if (status->flush_inputs_to_zero) {
2314         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2315             float_raise(float_flag_input_denormal, status);
2316             return make_float64(float64_val(a) & (1ULL << 63));
2317         }
2318     }
2319     return a;
2320 }
2321
2322 /*----------------------------------------------------------------------------
2323 | Normalizes the subnormal double-precision floating-point value represented
2324 | by the denormalized significand `aSig'.  The normalized exponent and
2325 | significand are stored at the locations pointed to by `zExpPtr' and
2326 | `zSigPtr', respectively.
2327 *----------------------------------------------------------------------------*/
2328
2329 static void
2330  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2331 {
2332     int8_t shiftCount;
2333
2334     shiftCount = countLeadingZeros64( aSig ) - 11;
2335     *zSigPtr = aSig<<shiftCount;
2336     *zExpPtr = 1 - shiftCount;
2337
2338 }
2339
2340 /*----------------------------------------------------------------------------
2341 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2342 | double-precision floating-point value, returning the result.  After being
2343 | shifted into the proper positions, the three fields are simply added
2344 | together to form the result.  This means that any integer portion of `zSig'
2345 | will be added into the exponent.  Since a properly normalized significand
2346 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2347 | than the desired result exponent whenever `zSig' is a complete, normalized
2348 | significand.
2349 *----------------------------------------------------------------------------*/
2350
2351 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2352 {
2353
2354     return make_float64(
2355         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2356
2357 }
2358
2359 /*----------------------------------------------------------------------------
2360 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2361 | and significand `zSig', and returns the proper double-precision floating-
2362 | point value corresponding to the abstract input.  Ordinarily, the abstract
2363 | value is simply rounded and packed into the double-precision format, with
2364 | the inexact exception raised if the abstract input cannot be represented
2365 | exactly.  However, if the abstract value is too large, the overflow and
2366 | inexact exceptions are raised and an infinity or maximal finite value is
2367 | returned.  If the abstract value is too small, the input value is rounded to
2368 | a subnormal number, and the underflow and inexact exceptions are raised if
2369 | the abstract input cannot be represented exactly as a subnormal double-
2370 | precision floating-point number.
2371 |     The input significand `zSig' has its binary point between bits 62
2372 | and 61, which is 10 bits to the left of the usual location.  This shifted
2373 | significand must be normalized or smaller.  If `zSig' is not normalized,
2374 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2375 | and it must not require rounding.  In the usual case that `zSig' is
2376 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2377 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2378 | Binary Floating-Point Arithmetic.
2379 *----------------------------------------------------------------------------*/
2380
2381 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2382                                    float_status *status)
2383 {
2384     int8_t roundingMode;
2385     flag roundNearestEven;
2386     int roundIncrement, roundBits;
2387     flag isTiny;
2388
2389     roundingMode = status->float_rounding_mode;
2390     roundNearestEven = ( roundingMode == float_round_nearest_even );
2391     switch (roundingMode) {
2392     case float_round_nearest_even:
2393     case float_round_ties_away:
2394         roundIncrement = 0x200;
2395         break;
2396     case float_round_to_zero:
2397         roundIncrement = 0;
2398         break;
2399     case float_round_up:
2400         roundIncrement = zSign ? 0 : 0x3ff;
2401         break;
2402     case float_round_down:
2403         roundIncrement = zSign ? 0x3ff : 0;
2404         break;
2405     case float_round_to_odd:
2406         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2407         break;
2408     default:
2409         abort();
2410     }
2411     roundBits = zSig & 0x3FF;
2412     if ( 0x7FD <= (uint16_t) zExp ) {
2413         if (    ( 0x7FD < zExp )
2414              || (    ( zExp == 0x7FD )
2415                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2416            ) {
2417             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2418                                    roundIncrement != 0;
2419             float_raise(float_flag_overflow | float_flag_inexact, status);
2420             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2421         }
2422         if ( zExp < 0 ) {
2423             if (status->flush_to_zero) {
2424                 float_raise(float_flag_output_denormal, status);
2425                 return packFloat64(zSign, 0, 0);
2426             }
2427             isTiny =
2428                    (status->float_detect_tininess
2429                     == float_tininess_before_rounding)
2430                 || ( zExp < -1 )
2431                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2432             shift64RightJamming( zSig, - zExp, &zSig );
2433             zExp = 0;
2434             roundBits = zSig & 0x3FF;
2435             if (isTiny && roundBits) {
2436                 float_raise(float_flag_underflow, status);
2437             }
2438             if (roundingMode == float_round_to_odd) {
2439                 /*
2440                  * For round-to-odd case, the roundIncrement depends on
2441                  * zSig which just changed.
2442                  */
2443                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2444             }
2445         }
2446     }
2447     if (roundBits) {
2448         status->float_exception_flags |= float_flag_inexact;
2449     }
2450     zSig = ( zSig + roundIncrement )>>10;
2451     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2452     if ( zSig == 0 ) zExp = 0;
2453     return packFloat64( zSign, zExp, zSig );
2454
2455 }
2456
2457 /*----------------------------------------------------------------------------
2458 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2459 | and significand `zSig', and returns the proper double-precision floating-
2460 | point value corresponding to the abstract input.  This routine is just like
2461 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2462 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2463 | floating-point exponent.
2464 *----------------------------------------------------------------------------*/
2465
2466 static float64
2467  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2468                               float_status *status)
2469 {
2470     int8_t shiftCount;
2471
2472     shiftCount = countLeadingZeros64( zSig ) - 1;
2473     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2474                                status);
2475
2476 }
2477
2478 /*----------------------------------------------------------------------------
2479 | Normalizes the subnormal extended double-precision floating-point value
2480 | represented by the denormalized significand `aSig'.  The normalized exponent
2481 | and significand are stored at the locations pointed to by `zExpPtr' and
2482 | `zSigPtr', respectively.
2483 *----------------------------------------------------------------------------*/
2484
2485 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2486                                 uint64_t *zSigPtr)
2487 {
2488     int8_t shiftCount;
2489
2490     shiftCount = countLeadingZeros64( aSig );
2491     *zSigPtr = aSig<<shiftCount;
2492     *zExpPtr = 1 - shiftCount;
2493 }
2494
2495 /*----------------------------------------------------------------------------
2496 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2497 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2498 | and returns the proper extended double-precision floating-point value
2499 | corresponding to the abstract input.  Ordinarily, the abstract value is
2500 | rounded and packed into the extended double-precision format, with the
2501 | inexact exception raised if the abstract input cannot be represented
2502 | exactly.  However, if the abstract value is too large, the overflow and
2503 | inexact exceptions are raised and an infinity or maximal finite value is
2504 | returned.  If the abstract value is too small, the input value is rounded to
2505 | a subnormal number, and the underflow and inexact exceptions are raised if
2506 | the abstract input cannot be represented exactly as a subnormal extended
2507 | double-precision floating-point number.
2508 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2509 | number of bits as single or double precision, respectively.  Otherwise, the
2510 | result is rounded to the full precision of the extended double-precision
2511 | format.
2512 |     The input significand must be normalized or smaller.  If the input
2513 | significand is not normalized, `zExp' must be 0; in that case, the result
2514 | returned is a subnormal number, and it must not require rounding.  The
2515 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2516 | Floating-Point Arithmetic.
2517 *----------------------------------------------------------------------------*/
2518
2519 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2520                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2521                               float_status *status)
2522 {
2523     int8_t roundingMode;
2524     flag roundNearestEven, increment, isTiny;
2525     int64_t roundIncrement, roundMask, roundBits;
2526
2527     roundingMode = status->float_rounding_mode;
2528     roundNearestEven = ( roundingMode == float_round_nearest_even );
2529     if ( roundingPrecision == 80 ) goto precision80;
2530     if ( roundingPrecision == 64 ) {
2531         roundIncrement = LIT64( 0x0000000000000400 );
2532         roundMask = LIT64( 0x00000000000007FF );
2533     }
2534     else if ( roundingPrecision == 32 ) {
2535         roundIncrement = LIT64( 0x0000008000000000 );
2536         roundMask = LIT64( 0x000000FFFFFFFFFF );
2537     }
2538     else {
2539         goto precision80;
2540     }
2541     zSig0 |= ( zSig1 != 0 );
2542     switch (roundingMode) {
2543     case float_round_nearest_even:
2544     case float_round_ties_away:
2545         break;
2546     case float_round_to_zero:
2547         roundIncrement = 0;
2548         break;
2549     case float_round_up:
2550         roundIncrement = zSign ? 0 : roundMask;
2551         break;
2552     case float_round_down:
2553         roundIncrement = zSign ? roundMask : 0;
2554         break;
2555     default:
2556         abort();
2557     }
2558     roundBits = zSig0 & roundMask;
2559     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2560         if (    ( 0x7FFE < zExp )
2561              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2562            ) {
2563             goto overflow;
2564         }
2565         if ( zExp <= 0 ) {
2566             if (status->flush_to_zero) {
2567                 float_raise(float_flag_output_denormal, status);
2568                 return packFloatx80(zSign, 0, 0);
2569             }
2570             isTiny =
2571                    (status->float_detect_tininess
2572                     == float_tininess_before_rounding)
2573                 || ( zExp < 0 )
2574                 || ( zSig0 <= zSig0 + roundIncrement );
2575             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2576             zExp = 0;
2577             roundBits = zSig0 & roundMask;
2578             if (isTiny && roundBits) {
2579                 float_raise(float_flag_underflow, status);
2580             }
2581             if (roundBits) {
2582                 status->float_exception_flags |= float_flag_inexact;
2583             }
2584             zSig0 += roundIncrement;
2585             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2586             roundIncrement = roundMask + 1;
2587             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2588                 roundMask |= roundIncrement;
2589             }
2590             zSig0 &= ~ roundMask;
2591             return packFloatx80( zSign, zExp, zSig0 );
2592         }
2593     }
2594     if (roundBits) {
2595         status->float_exception_flags |= float_flag_inexact;
2596     }
2597     zSig0 += roundIncrement;
2598     if ( zSig0 < roundIncrement ) {
2599         ++zExp;
2600         zSig0 = LIT64( 0x8000000000000000 );
2601     }
2602     roundIncrement = roundMask + 1;
2603     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2604         roundMask |= roundIncrement;
2605     }
2606     zSig0 &= ~ roundMask;
2607     if ( zSig0 == 0 ) zExp = 0;
2608     return packFloatx80( zSign, zExp, zSig0 );
2609  precision80:
2610     switch (roundingMode) {
2611     case float_round_nearest_even:
2612     case float_round_ties_away:
2613         increment = ((int64_t)zSig1 < 0);
2614         break;
2615     case float_round_to_zero:
2616         increment = 0;
2617         break;
2618     case float_round_up:
2619         increment = !zSign && zSig1;
2620         break;
2621     case float_round_down:
2622         increment = zSign && zSig1;
2623         break;
2624     default:
2625         abort();
2626     }
2627     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2628         if (    ( 0x7FFE < zExp )
2629              || (    ( zExp == 0x7FFE )
2630                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2631                   && increment
2632                 )
2633            ) {
2634             roundMask = 0;
2635  overflow:
2636             float_raise(float_flag_overflow | float_flag_inexact, status);
2637             if (    ( roundingMode == float_round_to_zero )
2638                  || ( zSign && ( roundingMode == float_round_up ) )
2639                  || ( ! zSign && ( roundingMode == float_round_down ) )
2640                ) {
2641                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2642             }
2643             return packFloatx80(zSign,
2644                                 floatx80_infinity_high,
2645                                 floatx80_infinity_low);
2646         }
2647         if ( zExp <= 0 ) {
2648             isTiny =
2649                    (status->float_detect_tininess
2650                     == float_tininess_before_rounding)
2651                 || ( zExp < 0 )
2652                 || ! increment
2653                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2654             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2655             zExp = 0;
2656             if (isTiny && zSig1) {
2657                 float_raise(float_flag_underflow, status);
2658             }
2659             if (zSig1) {
2660                 status->float_exception_flags |= float_flag_inexact;
2661             }
2662             switch (roundingMode) {
2663             case float_round_nearest_even:
2664             case float_round_ties_away:
2665                 increment = ((int64_t)zSig1 < 0);
2666                 break;
2667             case float_round_to_zero:
2668                 increment = 0;
2669                 break;
2670             case float_round_up:
2671                 increment = !zSign && zSig1;
2672                 break;
2673             case float_round_down:
2674                 increment = zSign && zSig1;
2675                 break;
2676             default:
2677                 abort();
2678             }
2679             if ( increment ) {
2680                 ++zSig0;
2681                 zSig0 &=
2682                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2683                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2684             }
2685             return packFloatx80( zSign, zExp, zSig0 );
2686         }
2687     }
2688     if (zSig1) {
2689         status->float_exception_flags |= float_flag_inexact;
2690     }
2691     if ( increment ) {
2692         ++zSig0;
2693         if ( zSig0 == 0 ) {
2694             ++zExp;
2695             zSig0 = LIT64( 0x8000000000000000 );
2696         }
2697         else {
2698             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2699         }
2700     }
2701     else {
2702         if ( zSig0 == 0 ) zExp = 0;
2703     }
2704     return packFloatx80( zSign, zExp, zSig0 );
2705
2706 }
2707
2708 /*----------------------------------------------------------------------------
2709 | Takes an abstract floating-point value having sign `zSign', exponent
2710 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2711 | and returns the proper extended double-precision floating-point value
2712 | corresponding to the abstract input.  This routine is just like
2713 | `roundAndPackFloatx80' except that the input significand does not have to be
2714 | normalized.
2715 *----------------------------------------------------------------------------*/
2716
2717 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2718                                        flag zSign, int32_t zExp,
2719                                        uint64_t zSig0, uint64_t zSig1,
2720                                        float_status *status)
2721 {
2722     int8_t shiftCount;
2723
2724     if ( zSig0 == 0 ) {
2725         zSig0 = zSig1;
2726         zSig1 = 0;
2727         zExp -= 64;
2728     }
2729     shiftCount = countLeadingZeros64( zSig0 );
2730     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2731     zExp -= shiftCount;
2732     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2733                                 zSig0, zSig1, status);
2734
2735 }
2736
2737 /*----------------------------------------------------------------------------
2738 | Returns the least-significant 64 fraction bits of the quadruple-precision
2739 | floating-point value `a'.
2740 *----------------------------------------------------------------------------*/
2741
2742 static inline uint64_t extractFloat128Frac1( float128 a )
2743 {
2744
2745     return a.low;
2746
2747 }
2748
2749 /*----------------------------------------------------------------------------
2750 | Returns the most-significant 48 fraction bits of the quadruple-precision
2751 | floating-point value `a'.
2752 *----------------------------------------------------------------------------*/
2753
2754 static inline uint64_t extractFloat128Frac0( float128 a )
2755 {
2756
2757     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2758
2759 }
2760
2761 /*----------------------------------------------------------------------------
2762 | Returns the exponent bits of the quadruple-precision floating-point value
2763 | `a'.
2764 *----------------------------------------------------------------------------*/
2765
2766 static inline int32_t extractFloat128Exp( float128 a )
2767 {
2768
2769     return ( a.high>>48 ) & 0x7FFF;
2770
2771 }
2772
2773 /*----------------------------------------------------------------------------
2774 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2775 *----------------------------------------------------------------------------*/
2776
2777 static inline flag extractFloat128Sign( float128 a )
2778 {
2779
2780     return a.high>>63;
2781
2782 }
2783
2784 /*----------------------------------------------------------------------------
2785 | Normalizes the subnormal quadruple-precision floating-point value
2786 | represented by the denormalized significand formed by the concatenation of
2787 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2788 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2789 | significand are stored at the location pointed to by `zSig0Ptr', and the
2790 | least significant 64 bits of the normalized significand are stored at the
2791 | location pointed to by `zSig1Ptr'.
2792 *----------------------------------------------------------------------------*/
2793
2794 static void
2795  normalizeFloat128Subnormal(
2796      uint64_t aSig0,
2797      uint64_t aSig1,
2798      int32_t *zExpPtr,
2799      uint64_t *zSig0Ptr,
2800      uint64_t *zSig1Ptr
2801  )
2802 {
2803     int8_t shiftCount;
2804
2805     if ( aSig0 == 0 ) {
2806         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2807         if ( shiftCount < 0 ) {
2808             *zSig0Ptr = aSig1>>( - shiftCount );
2809             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2810         }
2811         else {
2812             *zSig0Ptr = aSig1<<shiftCount;
2813             *zSig1Ptr = 0;
2814         }
2815         *zExpPtr = - shiftCount - 63;
2816     }
2817     else {
2818         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2819         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2820         *zExpPtr = 1 - shiftCount;
2821     }
2822
2823 }
2824
2825 /*----------------------------------------------------------------------------
2826 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2827 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2828 | floating-point value, returning the result.  After being shifted into the
2829 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2830 | added together to form the most significant 32 bits of the result.  This
2831 | means that any integer portion of `zSig0' will be added into the exponent.
2832 | Since a properly normalized significand will have an integer portion equal
2833 | to 1, the `zExp' input should be 1 less than the desired result exponent
2834 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2835 | significand.
2836 *----------------------------------------------------------------------------*/
2837
2838 static inline float128
2839  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2840 {
2841     float128 z;
2842
2843     z.low = zSig1;
2844     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2845     return z;
2846
2847 }
2848
2849 /*----------------------------------------------------------------------------
2850 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2851 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2852 | and `zSig2', and returns the proper quadruple-precision floating-point value
2853 | corresponding to the abstract input.  Ordinarily, the abstract value is
2854 | simply rounded and packed into the quadruple-precision format, with the
2855 | inexact exception raised if the abstract input cannot be represented
2856 | exactly.  However, if the abstract value is too large, the overflow and
2857 | inexact exceptions are raised and an infinity or maximal finite value is
2858 | returned.  If the abstract value is too small, the input value is rounded to
2859 | a subnormal number, and the underflow and inexact exceptions are raised if
2860 | the abstract input cannot be represented exactly as a subnormal quadruple-
2861 | precision floating-point number.
2862 |     The input significand must be normalized or smaller.  If the input
2863 | significand is not normalized, `zExp' must be 0; in that case, the result
2864 | returned is a subnormal number, and it must not require rounding.  In the
2865 | usual case that the input significand is normalized, `zExp' must be 1 less
2866 | than the ``true'' floating-point exponent.  The handling of underflow and
2867 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2868 *----------------------------------------------------------------------------*/
2869
2870 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2871                                      uint64_t zSig0, uint64_t zSig1,
2872                                      uint64_t zSig2, float_status *status)
2873 {
2874     int8_t roundingMode;
2875     flag roundNearestEven, increment, isTiny;
2876
2877     roundingMode = status->float_rounding_mode;
2878     roundNearestEven = ( roundingMode == float_round_nearest_even );
2879     switch (roundingMode) {
2880     case float_round_nearest_even:
2881     case float_round_ties_away:
2882         increment = ((int64_t)zSig2 < 0);
2883         break;
2884     case float_round_to_zero:
2885         increment = 0;
2886         break;
2887     case float_round_up:
2888         increment = !zSign && zSig2;
2889         break;
2890     case float_round_down:
2891         increment = zSign && zSig2;
2892         break;
2893     case float_round_to_odd:
2894         increment = !(zSig1 & 0x1) && zSig2;
2895         break;
2896     default:
2897         abort();
2898     }
2899     if ( 0x7FFD <= (uint32_t) zExp ) {
2900         if (    ( 0x7FFD < zExp )
2901              || (    ( zExp == 0x7FFD )
2902                   && eq128(
2903                          LIT64( 0x0001FFFFFFFFFFFF ),
2904                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2905                          zSig0,
2906                          zSig1
2907                      )
2908                   && increment
2909                 )
2910            ) {
2911             float_raise(float_flag_overflow | float_flag_inexact, status);
2912             if (    ( roundingMode == float_round_to_zero )
2913                  || ( zSign && ( roundingMode == float_round_up ) )
2914                  || ( ! zSign && ( roundingMode == float_round_down ) )
2915                  || (roundingMode == float_round_to_odd)
2916                ) {
2917                 return
2918                     packFloat128(
2919                         zSign,
2920                         0x7FFE,
2921                         LIT64( 0x0000FFFFFFFFFFFF ),
2922                         LIT64( 0xFFFFFFFFFFFFFFFF )
2923                     );
2924             }
2925             return packFloat128( zSign, 0x7FFF, 0, 0 );
2926         }
2927         if ( zExp < 0 ) {
2928             if (status->flush_to_zero) {
2929                 float_raise(float_flag_output_denormal, status);
2930                 return packFloat128(zSign, 0, 0, 0);
2931             }
2932             isTiny =
2933                    (status->float_detect_tininess
2934                     == float_tininess_before_rounding)
2935                 || ( zExp < -1 )
2936                 || ! increment
2937                 || lt128(
2938                        zSig0,
2939                        zSig1,
2940                        LIT64( 0x0001FFFFFFFFFFFF ),
2941                        LIT64( 0xFFFFFFFFFFFFFFFF )
2942                    );
2943             shift128ExtraRightJamming(
2944                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2945             zExp = 0;
2946             if (isTiny && zSig2) {
2947                 float_raise(float_flag_underflow, status);
2948             }
2949             switch (roundingMode) {
2950             case float_round_nearest_even:
2951             case float_round_ties_away:
2952                 increment = ((int64_t)zSig2 < 0);
2953                 break;
2954             case float_round_to_zero:
2955                 increment = 0;
2956                 break;
2957             case float_round_up:
2958                 increment = !zSign && zSig2;
2959                 break;
2960             case float_round_down:
2961                 increment = zSign && zSig2;
2962                 break;
2963             case float_round_to_odd:
2964                 increment = !(zSig1 & 0x1) && zSig2;
2965                 break;
2966             default:
2967                 abort();
2968             }
2969         }
2970     }
2971     if (zSig2) {
2972         status->float_exception_flags |= float_flag_inexact;
2973     }
2974     if ( increment ) {
2975         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2976         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2977     }
2978     else {
2979         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2980     }
2981     return packFloat128( zSign, zExp, zSig0, zSig1 );
2982
2983 }
2984
2985 /*----------------------------------------------------------------------------
2986 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2987 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2988 | returns the proper quadruple-precision floating-point value corresponding
2989 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2990 | except that the input significand has fewer bits and does not have to be
2991 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2992 | point exponent.
2993 *----------------------------------------------------------------------------*/
2994
2995 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2996                                               uint64_t zSig0, uint64_t zSig1,
2997                                               float_status *status)
2998 {
2999     int8_t shiftCount;
3000     uint64_t zSig2;
3001
3002     if ( zSig0 == 0 ) {
3003         zSig0 = zSig1;
3004         zSig1 = 0;
3005         zExp -= 64;
3006     }
3007     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3008     if ( 0 <= shiftCount ) {
3009         zSig2 = 0;
3010         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3011     }
3012     else {
3013         shift128ExtraRightJamming(
3014             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3015     }
3016     zExp -= shiftCount;
3017     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3018
3019 }
3020
3021
3022 /*----------------------------------------------------------------------------
3023 | Returns the result of converting the 32-bit two's complement integer `a'
3024 | to the extended double-precision floating-point format.  The conversion
3025 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3026 | Arithmetic.
3027 *----------------------------------------------------------------------------*/
3028
3029 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3030 {
3031     flag zSign;
3032     uint32_t absA;
3033     int8_t shiftCount;
3034     uint64_t zSig;
3035
3036     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3037     zSign = ( a < 0 );
3038     absA = zSign ? - a : a;
3039     shiftCount = countLeadingZeros32( absA ) + 32;
3040     zSig = absA;
3041     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3042
3043 }
3044
3045 /*----------------------------------------------------------------------------
3046 | Returns the result of converting the 32-bit two's complement integer `a' to
3047 | the quadruple-precision floating-point format.  The conversion is performed
3048 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3049 *----------------------------------------------------------------------------*/
3050
3051 float128 int32_to_float128(int32_t a, float_status *status)
3052 {
3053     flag zSign;
3054     uint32_t absA;
3055     int8_t shiftCount;
3056     uint64_t zSig0;
3057
3058     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3059     zSign = ( a < 0 );
3060     absA = zSign ? - a : a;
3061     shiftCount = countLeadingZeros32( absA ) + 17;
3062     zSig0 = absA;
3063     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3064
3065 }
3066
3067 /*----------------------------------------------------------------------------
3068 | Returns the result of converting the 64-bit two's complement integer `a'
3069 | to the extended double-precision floating-point format.  The conversion
3070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3071 | Arithmetic.
3072 *----------------------------------------------------------------------------*/
3073
3074 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3075 {
3076     flag zSign;
3077     uint64_t absA;
3078     int8_t shiftCount;
3079
3080     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3081     zSign = ( a < 0 );
3082     absA = zSign ? - a : a;
3083     shiftCount = countLeadingZeros64( absA );
3084     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3085
3086 }
3087
3088 /*----------------------------------------------------------------------------
3089 | Returns the result of converting the 64-bit two's complement integer `a' to
3090 | the quadruple-precision floating-point format.  The conversion is performed
3091 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3092 *----------------------------------------------------------------------------*/
3093
3094 float128 int64_to_float128(int64_t a, float_status *status)
3095 {
3096     flag zSign;
3097     uint64_t absA;
3098     int8_t shiftCount;
3099     int32_t zExp;
3100     uint64_t zSig0, zSig1;
3101
3102     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3103     zSign = ( a < 0 );
3104     absA = zSign ? - a : a;
3105     shiftCount = countLeadingZeros64( absA ) + 49;
3106     zExp = 0x406E - shiftCount;
3107     if ( 64 <= shiftCount ) {
3108         zSig1 = 0;
3109         zSig0 = absA;
3110         shiftCount -= 64;
3111     }
3112     else {
3113         zSig1 = absA;
3114         zSig0 = 0;
3115     }
3116     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3117     return packFloat128( zSign, zExp, zSig0, zSig1 );
3118
3119 }
3120
3121 /*----------------------------------------------------------------------------
3122 | Returns the result of converting the 64-bit unsigned integer `a'
3123 | to the quadruple-precision floating-point format.  The conversion is performed
3124 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3125 *----------------------------------------------------------------------------*/
3126
3127 float128 uint64_to_float128(uint64_t a, float_status *status)
3128 {
3129     if (a == 0) {
3130         return float128_zero;
3131     }
3132     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3133 }
3134
3135
3136
3137
3138 /*----------------------------------------------------------------------------
3139 | Returns the result of converting the single-precision floating-point value
3140 | `a' to the double-precision floating-point format.  The conversion is
3141 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3142 | Arithmetic.
3143 *----------------------------------------------------------------------------*/
3144
3145 float64 float32_to_float64(float32 a, float_status *status)
3146 {
3147     flag aSign;
3148     int aExp;
3149     uint32_t aSig;
3150     a = float32_squash_input_denormal(a, status);
3151
3152     aSig = extractFloat32Frac( a );
3153     aExp = extractFloat32Exp( a );
3154     aSign = extractFloat32Sign( a );
3155     if ( aExp == 0xFF ) {
3156         if (aSig) {
3157             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3158         }
3159         return packFloat64( aSign, 0x7FF, 0 );
3160     }
3161     if ( aExp == 0 ) {
3162         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3163         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3164         --aExp;
3165     }
3166     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3167
3168 }
3169
3170 /*----------------------------------------------------------------------------
3171 | Returns the result of converting the single-precision floating-point value
3172 | `a' to the extended double-precision floating-point format.  The conversion
3173 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3174 | Arithmetic.
3175 *----------------------------------------------------------------------------*/
3176
3177 floatx80 float32_to_floatx80(float32 a, float_status *status)
3178 {
3179     flag aSign;
3180     int aExp;
3181     uint32_t aSig;
3182
3183     a = float32_squash_input_denormal(a, status);
3184     aSig = extractFloat32Frac( a );
3185     aExp = extractFloat32Exp( a );
3186     aSign = extractFloat32Sign( a );
3187     if ( aExp == 0xFF ) {
3188         if (aSig) {
3189             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3190         }
3191         return packFloatx80(aSign,
3192                             floatx80_infinity_high,
3193                             floatx80_infinity_low);
3194     }
3195     if ( aExp == 0 ) {
3196         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3197         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3198     }
3199     aSig |= 0x00800000;
3200     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3201
3202 }
3203
3204 /*----------------------------------------------------------------------------
3205 | Returns the result of converting the single-precision floating-point value
3206 | `a' to the double-precision floating-point format.  The conversion is
3207 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3208 | Arithmetic.
3209 *----------------------------------------------------------------------------*/
3210
3211 float128 float32_to_float128(float32 a, float_status *status)
3212 {
3213     flag aSign;
3214     int aExp;
3215     uint32_t aSig;
3216
3217     a = float32_squash_input_denormal(a, status);
3218     aSig = extractFloat32Frac( a );
3219     aExp = extractFloat32Exp( a );
3220     aSign = extractFloat32Sign( a );
3221     if ( aExp == 0xFF ) {
3222         if (aSig) {
3223             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3224         }
3225         return packFloat128( aSign, 0x7FFF, 0, 0 );
3226     }
3227     if ( aExp == 0 ) {
3228         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3229         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3230         --aExp;
3231     }
3232     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3233
3234 }
3235
3236 /*----------------------------------------------------------------------------
3237 | Returns the remainder of the single-precision floating-point value `a'
3238 | with respect to the corresponding value `b'.  The operation is performed
3239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3240 *----------------------------------------------------------------------------*/
3241
3242 float32 float32_rem(float32 a, float32 b, float_status *status)
3243 {
3244     flag aSign, zSign;
3245     int aExp, bExp, expDiff;
3246     uint32_t aSig, bSig;
3247     uint32_t q;
3248     uint64_t aSig64, bSig64, q64;
3249     uint32_t alternateASig;
3250     int32_t sigMean;
3251     a = float32_squash_input_denormal(a, status);
3252     b = float32_squash_input_denormal(b, status);
3253
3254     aSig = extractFloat32Frac( a );
3255     aExp = extractFloat32Exp( a );
3256     aSign = extractFloat32Sign( a );
3257     bSig = extractFloat32Frac( b );
3258     bExp = extractFloat32Exp( b );
3259     if ( aExp == 0xFF ) {
3260         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3261             return propagateFloat32NaN(a, b, status);
3262         }
3263         float_raise(float_flag_invalid, status);
3264         return float32_default_nan(status);
3265     }
3266     if ( bExp == 0xFF ) {
3267         if (bSig) {
3268             return propagateFloat32NaN(a, b, status);
3269         }
3270         return a;
3271     }
3272     if ( bExp == 0 ) {
3273         if ( bSig == 0 ) {
3274             float_raise(float_flag_invalid, status);
3275             return float32_default_nan(status);
3276         }
3277         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3278     }
3279     if ( aExp == 0 ) {
3280         if ( aSig == 0 ) return a;
3281         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3282     }
3283     expDiff = aExp - bExp;
3284     aSig |= 0x00800000;
3285     bSig |= 0x00800000;
3286     if ( expDiff < 32 ) {
3287         aSig <<= 8;
3288         bSig <<= 8;
3289         if ( expDiff < 0 ) {
3290             if ( expDiff < -1 ) return a;
3291             aSig >>= 1;
3292         }
3293         q = ( bSig <= aSig );
3294         if ( q ) aSig -= bSig;
3295         if ( 0 < expDiff ) {
3296             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3297             q >>= 32 - expDiff;
3298             bSig >>= 2;
3299             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3300         }
3301         else {
3302             aSig >>= 2;
3303             bSig >>= 2;
3304         }
3305     }
3306     else {
3307         if ( bSig <= aSig ) aSig -= bSig;
3308         aSig64 = ( (uint64_t) aSig )<<40;
3309         bSig64 = ( (uint64_t) bSig )<<40;
3310         expDiff -= 64;
3311         while ( 0 < expDiff ) {
3312             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3313             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3314             aSig64 = - ( ( bSig * q64 )<<38 );
3315             expDiff -= 62;
3316         }
3317         expDiff += 64;
3318         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3319         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3320         q = q64>>( 64 - expDiff );
3321         bSig <<= 6;
3322         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3323     }
3324     do {
3325         alternateASig = aSig;
3326         ++q;
3327         aSig -= bSig;
3328     } while ( 0 <= (int32_t) aSig );
3329     sigMean = aSig + alternateASig;
3330     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3331         aSig = alternateASig;
3332     }
3333     zSign = ( (int32_t) aSig < 0 );
3334     if ( zSign ) aSig = - aSig;
3335     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3336 }
3337
3338
3339
3340 /*----------------------------------------------------------------------------
3341 | Returns the binary exponential of the single-precision floating-point value
3342 | `a'. The operation is performed according to the IEC/IEEE Standard for
3343 | Binary Floating-Point Arithmetic.
3344 |
3345 | Uses the following identities:
3346 |
3347 | 1. -------------------------------------------------------------------------
3348 |      x    x*ln(2)
3349 |     2  = e
3350 |
3351 | 2. -------------------------------------------------------------------------
3352 |                      2     3     4     5           n
3353 |      x        x     x     x     x     x           x
3354 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3355 |               1!    2!    3!    4!    5!          n!
3356 *----------------------------------------------------------------------------*/
3357
3358 static const float64 float32_exp2_coefficients[15] =
3359 {
3360     const_float64( 0x3ff0000000000000ll ), /*  1 */
3361     const_float64( 0x3fe0000000000000ll ), /*  2 */
3362     const_float64( 0x3fc5555555555555ll ), /*  3 */
3363     const_float64( 0x3fa5555555555555ll ), /*  4 */
3364     const_float64( 0x3f81111111111111ll ), /*  5 */
3365     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3366     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3367     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3368     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3369     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3370     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3371     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3372     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3373     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3374     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3375 };
3376
3377 float32 float32_exp2(float32 a, float_status *status)
3378 {
3379     flag aSign;
3380     int aExp;
3381     uint32_t aSig;
3382     float64 r, x, xn;
3383     int i;
3384     a = float32_squash_input_denormal(a, status);
3385
3386     aSig = extractFloat32Frac( a );
3387     aExp = extractFloat32Exp( a );
3388     aSign = extractFloat32Sign( a );
3389
3390     if ( aExp == 0xFF) {
3391         if (aSig) {
3392             return propagateFloat32NaN(a, float32_zero, status);
3393         }
3394         return (aSign) ? float32_zero : a;
3395     }
3396     if (aExp == 0) {
3397         if (aSig == 0) return float32_one;
3398     }
3399
3400     float_raise(float_flag_inexact, status);
3401
3402     /* ******************************* */
3403     /* using float64 for approximation */
3404     /* ******************************* */
3405     x = float32_to_float64(a, status);
3406     x = float64_mul(x, float64_ln2, status);
3407
3408     xn = x;
3409     r = float64_one;
3410     for (i = 0 ; i < 15 ; i++) {
3411         float64 f;
3412
3413         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3414         r = float64_add(r, f, status);
3415
3416         xn = float64_mul(xn, x, status);
3417     }
3418
3419     return float64_to_float32(r, status);
3420 }
3421
3422 /*----------------------------------------------------------------------------
3423 | Returns the binary log of the single-precision floating-point value `a'.
3424 | The operation is performed according to the IEC/IEEE Standard for Binary
3425 | Floating-Point Arithmetic.
3426 *----------------------------------------------------------------------------*/
3427 float32 float32_log2(float32 a, float_status *status)
3428 {
3429     flag aSign, zSign;
3430     int aExp;
3431     uint32_t aSig, zSig, i;
3432
3433     a = float32_squash_input_denormal(a, status);
3434     aSig = extractFloat32Frac( a );
3435     aExp = extractFloat32Exp( a );
3436     aSign = extractFloat32Sign( a );
3437
3438     if ( aExp == 0 ) {
3439         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3440         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3441     }
3442     if ( aSign ) {
3443         float_raise(float_flag_invalid, status);
3444         return float32_default_nan(status);
3445     }
3446     if ( aExp == 0xFF ) {
3447         if (aSig) {
3448             return propagateFloat32NaN(a, float32_zero, status);
3449         }
3450         return a;
3451     }
3452
3453     aExp -= 0x7F;
3454     aSig |= 0x00800000;
3455     zSign = aExp < 0;
3456     zSig = aExp << 23;
3457
3458     for (i = 1 << 22; i > 0; i >>= 1) {
3459         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3460         if ( aSig & 0x01000000 ) {
3461             aSig >>= 1;
3462             zSig |= i;
3463         }
3464     }
3465
3466     if ( zSign )
3467         zSig = -zSig;
3468
3469     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3470 }
3471
3472 /*----------------------------------------------------------------------------
3473 | Returns 1 if the single-precision floating-point value `a' is equal to
3474 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3475 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3477 *----------------------------------------------------------------------------*/
3478
3479 int float32_eq(float32 a, float32 b, float_status *status)
3480 {
3481     uint32_t av, bv;
3482     a = float32_squash_input_denormal(a, status);
3483     b = float32_squash_input_denormal(b, status);
3484
3485     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3486          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3487        ) {
3488         float_raise(float_flag_invalid, status);
3489         return 0;
3490     }
3491     av = float32_val(a);
3492     bv = float32_val(b);
3493     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3494 }
3495
3496 /*----------------------------------------------------------------------------
3497 | Returns 1 if the single-precision floating-point value `a' is less than
3498 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3499 | exception is raised if either operand is a NaN.  The comparison is performed
3500 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3501 *----------------------------------------------------------------------------*/
3502
3503 int float32_le(float32 a, float32 b, float_status *status)
3504 {
3505     flag aSign, bSign;
3506     uint32_t av, bv;
3507     a = float32_squash_input_denormal(a, status);
3508     b = float32_squash_input_denormal(b, status);
3509
3510     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3511          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3512        ) {
3513         float_raise(float_flag_invalid, status);
3514         return 0;
3515     }
3516     aSign = extractFloat32Sign( a );
3517     bSign = extractFloat32Sign( b );
3518     av = float32_val(a);
3519     bv = float32_val(b);
3520     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3521     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3522
3523 }
3524
3525 /*----------------------------------------------------------------------------
3526 | Returns 1 if the single-precision floating-point value `a' is less than
3527 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3528 | raised if either operand is a NaN.  The comparison is performed according
3529 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3530 *----------------------------------------------------------------------------*/
3531
3532 int float32_lt(float32 a, float32 b, float_status *status)
3533 {
3534     flag aSign, bSign;
3535     uint32_t av, bv;
3536     a = float32_squash_input_denormal(a, status);
3537     b = float32_squash_input_denormal(b, status);
3538
3539     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3540          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3541        ) {
3542         float_raise(float_flag_invalid, status);
3543         return 0;
3544     }
3545     aSign = extractFloat32Sign( a );
3546     bSign = extractFloat32Sign( b );
3547     av = float32_val(a);
3548     bv = float32_val(b);
3549     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3550     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3551
3552 }
3553
3554 /*----------------------------------------------------------------------------
3555 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3556 | be compared, and 0 otherwise.  The invalid exception is raised if either
3557 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3558 | Standard for Binary Floating-Point Arithmetic.
3559 *----------------------------------------------------------------------------*/
3560
3561 int float32_unordered(float32 a, float32 b, float_status *status)
3562 {
3563     a = float32_squash_input_denormal(a, status);
3564     b = float32_squash_input_denormal(b, status);
3565
3566     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3567          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3568        ) {
3569         float_raise(float_flag_invalid, status);
3570         return 1;
3571     }
3572     return 0;
3573 }
3574
3575 /*----------------------------------------------------------------------------
3576 | Returns 1 if the single-precision floating-point value `a' is equal to
3577 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3578 | exception.  The comparison is performed according to the IEC/IEEE Standard
3579 | for Binary Floating-Point Arithmetic.
3580 *----------------------------------------------------------------------------*/
3581
3582 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3583 {
3584     a = float32_squash_input_denormal(a, status);
3585     b = float32_squash_input_denormal(b, status);
3586
3587     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3588          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3589        ) {
3590         if (float32_is_signaling_nan(a, status)
3591          || float32_is_signaling_nan(b, status)) {
3592             float_raise(float_flag_invalid, status);
3593         }
3594         return 0;
3595     }
3596     return ( float32_val(a) == float32_val(b) ) ||
3597             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3598 }
3599
3600 /*----------------------------------------------------------------------------
3601 | Returns 1 if the single-precision floating-point value `a' is less than or
3602 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3603 | cause an exception.  Otherwise, the comparison is performed according to the
3604 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3605 *----------------------------------------------------------------------------*/
3606
3607 int float32_le_quiet(float32 a, float32 b, float_status *status)
3608 {
3609     flag aSign, bSign;
3610     uint32_t av, bv;
3611     a = float32_squash_input_denormal(a, status);
3612     b = float32_squash_input_denormal(b, status);
3613
3614     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3615          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3616        ) {
3617         if (float32_is_signaling_nan(a, status)
3618          || float32_is_signaling_nan(b, status)) {
3619             float_raise(float_flag_invalid, status);
3620         }
3621         return 0;
3622     }
3623     aSign = extractFloat32Sign( a );
3624     bSign = extractFloat32Sign( b );
3625     av = float32_val(a);
3626     bv = float32_val(b);
3627     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3628     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3629
3630 }
3631
3632 /*----------------------------------------------------------------------------
3633 | Returns 1 if the single-precision floating-point value `a' is less than
3634 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3635 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3636 | Standard for Binary Floating-Point Arithmetic.
3637 *----------------------------------------------------------------------------*/
3638
3639 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3640 {
3641     flag aSign, bSign;
3642     uint32_t av, bv;
3643     a = float32_squash_input_denormal(a, status);
3644     b = float32_squash_input_denormal(b, status);
3645
3646     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3647          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3648        ) {
3649         if (float32_is_signaling_nan(a, status)
3650          || float32_is_signaling_nan(b, status)) {
3651             float_raise(float_flag_invalid, status);
3652         }
3653         return 0;
3654     }
3655     aSign = extractFloat32Sign( a );
3656     bSign = extractFloat32Sign( b );
3657     av = float32_val(a);
3658     bv = float32_val(b);
3659     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3660     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3661
3662 }
3663
3664 /*----------------------------------------------------------------------------
3665 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3666 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3667 | comparison is performed according to the IEC/IEEE Standard for Binary
3668 | Floating-Point Arithmetic.
3669 *----------------------------------------------------------------------------*/
3670
3671 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3672 {
3673     a = float32_squash_input_denormal(a, status);
3674     b = float32_squash_input_denormal(b, status);
3675
3676     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3677          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3678        ) {
3679         if (float32_is_signaling_nan(a, status)
3680          || float32_is_signaling_nan(b, status)) {
3681             float_raise(float_flag_invalid, status);
3682         }
3683         return 1;
3684     }
3685     return 0;
3686 }
3687
3688
3689 /*----------------------------------------------------------------------------
3690 | Returns the result of converting the double-precision floating-point value
3691 | `a' to the single-precision floating-point format.  The conversion is
3692 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3693 | Arithmetic.
3694 *----------------------------------------------------------------------------*/
3695
3696 float32 float64_to_float32(float64 a, float_status *status)
3697 {
3698     flag aSign;
3699     int aExp;
3700     uint64_t aSig;
3701     uint32_t zSig;
3702     a = float64_squash_input_denormal(a, status);
3703
3704     aSig = extractFloat64Frac( a );
3705     aExp = extractFloat64Exp( a );
3706     aSign = extractFloat64Sign( a );
3707     if ( aExp == 0x7FF ) {
3708         if (aSig) {
3709             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3710         }
3711         return packFloat32( aSign, 0xFF, 0 );
3712     }
3713     shift64RightJamming( aSig, 22, &aSig );
3714     zSig = aSig;
3715     if ( aExp || zSig ) {
3716         zSig |= 0x40000000;
3717         aExp -= 0x381;
3718     }
3719     return roundAndPackFloat32(aSign, aExp, zSig, status);
3720
3721 }
3722
3723
3724 /*----------------------------------------------------------------------------
3725 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3726 | half-precision floating-point value, returning the result.  After being
3727 | shifted into the proper positions, the three fields are simply added
3728 | together to form the result.  This means that any integer portion of `zSig'
3729 | will be added into the exponent.  Since a properly normalized significand
3730 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3731 | than the desired result exponent whenever `zSig' is a complete, normalized
3732 | significand.
3733 *----------------------------------------------------------------------------*/
3734 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3735 {
3736     return make_float16(
3737         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3738 }
3739
3740 /*----------------------------------------------------------------------------
3741 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3742 | and significand `zSig', and returns the proper half-precision floating-
3743 | point value corresponding to the abstract input.  Ordinarily, the abstract
3744 | value is simply rounded and packed into the half-precision format, with
3745 | the inexact exception raised if the abstract input cannot be represented
3746 | exactly.  However, if the abstract value is too large, the overflow and
3747 | inexact exceptions are raised and an infinity or maximal finite value is
3748 | returned.  If the abstract value is too small, the input value is rounded to
3749 | a subnormal number, and the underflow and inexact exceptions are raised if
3750 | the abstract input cannot be represented exactly as a subnormal half-
3751 | precision floating-point number.
3752 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3753 | ARM-style "alternative representation", which omits the NaN and Inf
3754 | encodings in order to raise the maximum representable exponent by one.
3755 |     The input significand `zSig' has its binary point between bits 22
3756 | and 23, which is 13 bits to the left of the usual location.  This shifted
3757 | significand must be normalized or smaller.  If `zSig' is not normalized,
3758 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3759 | and it must not require rounding.  In the usual case that `zSig' is
3760 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3761 | Note the slightly odd position of the binary point in zSig compared with the
3762 | other roundAndPackFloat functions. This should probably be fixed if we
3763 | need to implement more float16 routines than just conversion.
3764 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3765 | Binary Floating-Point Arithmetic.
3766 *----------------------------------------------------------------------------*/
3767
3768 static float16 roundAndPackFloat16(flag zSign, int zExp,
3769                                    uint32_t zSig, flag ieee,
3770                                    float_status *status)
3771 {
3772     int maxexp = ieee ? 29 : 30;
3773     uint32_t mask;
3774     uint32_t increment;
3775     bool rounding_bumps_exp;
3776     bool is_tiny = false;
3777
3778     /* Calculate the mask of bits of the mantissa which are not
3779      * representable in half-precision and will be lost.
3780      */
3781     if (zExp < 1) {
3782         /* Will be denormal in halfprec */
3783         mask = 0x00ffffff;
3784         if (zExp >= -11) {
3785             mask >>= 11 + zExp;
3786         }
3787     } else {
3788         /* Normal number in halfprec */
3789         mask = 0x00001fff;
3790     }
3791
3792     switch (status->float_rounding_mode) {
3793     case float_round_nearest_even:
3794         increment = (mask + 1) >> 1;
3795         if ((zSig & mask) == increment) {
3796             increment = zSig & (increment << 1);
3797         }
3798         break;
3799     case float_round_ties_away:
3800         increment = (mask + 1) >> 1;
3801         break;
3802     case float_round_up:
3803         increment = zSign ? 0 : mask;
3804         break;
3805     case float_round_down:
3806         increment = zSign ? mask : 0;
3807         break;
3808     default: /* round_to_zero */
3809         increment = 0;
3810         break;
3811     }
3812
3813     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3814
3815     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3816         if (ieee) {
3817             float_raise(float_flag_overflow | float_flag_inexact, status);
3818             return packFloat16(zSign, 0x1f, 0);
3819         } else {
3820             float_raise(float_flag_invalid, status);
3821             return packFloat16(zSign, 0x1f, 0x3ff);
3822         }
3823     }
3824
3825     if (zExp < 0) {
3826         /* Note that flush-to-zero does not affect half-precision results */
3827         is_tiny =
3828             (status->float_detect_tininess == float_tininess_before_rounding)
3829             || (zExp < -1)
3830             || (!rounding_bumps_exp);
3831     }
3832     if (zSig & mask) {
3833         float_raise(float_flag_inexact, status);
3834         if (is_tiny) {
3835             float_raise(float_flag_underflow, status);
3836         }
3837     }
3838
3839     zSig += increment;
3840     if (rounding_bumps_exp) {
3841         zSig >>= 1;
3842         zExp++;
3843     }
3844
3845     if (zExp < -10) {
3846         return packFloat16(zSign, 0, 0);
3847     }
3848     if (zExp < 0) {
3849         zSig >>= -zExp;
3850         zExp = 0;
3851     }
3852     return packFloat16(zSign, zExp, zSig >> 13);
3853 }
3854
3855 /*----------------------------------------------------------------------------
3856 | If `a' is denormal and we are in flush-to-zero mode then set the
3857 | input-denormal exception and return zero. Otherwise just return the value.
3858 *----------------------------------------------------------------------------*/
3859 float16 float16_squash_input_denormal(float16 a, float_status *status)
3860 {
3861     if (status->flush_inputs_to_zero) {
3862         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3863             float_raise(float_flag_input_denormal, status);
3864             return make_float16(float16_val(a) & 0x8000);
3865         }
3866     }
3867     return a;
3868 }
3869
3870 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3871                                       uint32_t *zSigPtr)
3872 {
3873     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3874     *zSigPtr = aSig << shiftCount;
3875     *zExpPtr = 1 - shiftCount;
3876 }
3877
3878 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3879    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3880
3881 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3882 {
3883     flag aSign;
3884     int aExp;
3885     uint32_t aSig;
3886
3887     aSign = extractFloat16Sign(a);
3888     aExp = extractFloat16Exp(a);
3889     aSig = extractFloat16Frac(a);
3890
3891     if (aExp == 0x1f && ieee) {
3892         if (aSig) {
3893             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3894         }
3895         return packFloat32(aSign, 0xff, 0);
3896     }
3897     if (aExp == 0) {
3898         if (aSig == 0) {
3899             return packFloat32(aSign, 0, 0);
3900         }
3901
3902         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3903         aExp--;
3904     }
3905     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3906 }
3907
3908 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3909 {
3910     flag aSign;
3911     int aExp;
3912     uint32_t aSig;
3913
3914     a = float32_squash_input_denormal(a, status);
3915
3916     aSig = extractFloat32Frac( a );
3917     aExp = extractFloat32Exp( a );
3918     aSign = extractFloat32Sign( a );
3919     if ( aExp == 0xFF ) {
3920         if (aSig) {
3921             /* Input is a NaN */
3922             if (!ieee) {
3923                 float_raise(float_flag_invalid, status);
3924                 return packFloat16(aSign, 0, 0);
3925             }
3926             return commonNaNToFloat16(
3927                 float32ToCommonNaN(a, status), status);
3928         }
3929         /* Infinity */
3930         if (!ieee) {
3931             float_raise(float_flag_invalid, status);
3932             return packFloat16(aSign, 0x1f, 0x3ff);
3933         }
3934         return packFloat16(aSign, 0x1f, 0);
3935     }
3936     if (aExp == 0 && aSig == 0) {
3937         return packFloat16(aSign, 0, 0);
3938     }
3939     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3940      * even if the input is denormal; however this is harmless because
3941      * the largest possible single-precision denormal is still smaller
3942      * than the smallest representable half-precision denormal, and so we
3943      * will end up ignoring aSig and returning via the "always return zero"
3944      * codepath.
3945      */
3946     aSig |= 0x00800000;
3947     aExp -= 0x71;
3948
3949     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3950 }
3951
3952 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3953 {
3954     flag aSign;
3955     int aExp;
3956     uint32_t aSig;
3957
3958     aSign = extractFloat16Sign(a);
3959     aExp = extractFloat16Exp(a);
3960     aSig = extractFloat16Frac(a);
3961
3962     if (aExp == 0x1f && ieee) {
3963         if (aSig) {
3964             return commonNaNToFloat64(
3965                 float16ToCommonNaN(a, status), status);
3966         }
3967         return packFloat64(aSign, 0x7ff, 0);
3968     }
3969     if (aExp == 0) {
3970         if (aSig == 0) {
3971             return packFloat64(aSign, 0, 0);
3972         }
3973
3974         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3975         aExp--;
3976     }
3977     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3978 }
3979
3980 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3981 {
3982     flag aSign;
3983     int aExp;
3984     uint64_t aSig;
3985     uint32_t zSig;
3986
3987     a = float64_squash_input_denormal(a, status);
3988
3989     aSig = extractFloat64Frac(a);
3990     aExp = extractFloat64Exp(a);
3991     aSign = extractFloat64Sign(a);
3992     if (aExp == 0x7FF) {
3993         if (aSig) {
3994             /* Input is a NaN */
3995             if (!ieee) {
3996                 float_raise(float_flag_invalid, status);
3997                 return packFloat16(aSign, 0, 0);
3998             }
3999             return commonNaNToFloat16(
4000                 float64ToCommonNaN(a, status), status);
4001         }
4002         /* Infinity */
4003         if (!ieee) {
4004             float_raise(float_flag_invalid, status);
4005             return packFloat16(aSign, 0x1f, 0x3ff);
4006         }
4007         return packFloat16(aSign, 0x1f, 0);
4008     }
4009     shift64RightJamming(aSig, 29, &aSig);
4010     zSig = aSig;
4011     if (aExp == 0 && zSig == 0) {
4012         return packFloat16(aSign, 0, 0);
4013     }
4014     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4015      * even if the input is denormal; however this is harmless because
4016      * the largest possible single-precision denormal is still smaller
4017      * than the smallest representable half-precision denormal, and so we
4018      * will end up ignoring aSig and returning via the "always return zero"
4019      * codepath.
4020      */
4021     zSig |= 0x00800000;
4022     aExp -= 0x3F1;
4023
4024     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4025 }
4026
4027 /*----------------------------------------------------------------------------
4028 | Returns the result of converting the double-precision floating-point value
4029 | `a' to the extended double-precision floating-point format.  The conversion
4030 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4031 | Arithmetic.
4032 *----------------------------------------------------------------------------*/
4033
4034 floatx80 float64_to_floatx80(float64 a, float_status *status)
4035 {
4036     flag aSign;
4037     int aExp;
4038     uint64_t aSig;
4039
4040     a = float64_squash_input_denormal(a, status);
4041     aSig = extractFloat64Frac( a );
4042     aExp = extractFloat64Exp( a );
4043     aSign = extractFloat64Sign( a );
4044     if ( aExp == 0x7FF ) {
4045         if (aSig) {
4046             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4047         }
4048         return packFloatx80(aSign,
4049                             floatx80_infinity_high,
4050                             floatx80_infinity_low);
4051     }
4052     if ( aExp == 0 ) {
4053         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4054         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4055     }
4056     return
4057         packFloatx80(
4058             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4059
4060 }
4061
4062 /*----------------------------------------------------------------------------
4063 | Returns the result of converting the double-precision floating-point value
4064 | `a' to the quadruple-precision floating-point format.  The conversion is
4065 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4066 | Arithmetic.
4067 *----------------------------------------------------------------------------*/
4068
4069 float128 float64_to_float128(float64 a, float_status *status)
4070 {
4071     flag aSign;
4072     int aExp;
4073     uint64_t aSig, zSig0, zSig1;
4074
4075     a = float64_squash_input_denormal(a, status);
4076     aSig = extractFloat64Frac( a );
4077     aExp = extractFloat64Exp( a );
4078     aSign = extractFloat64Sign( a );
4079     if ( aExp == 0x7FF ) {
4080         if (aSig) {
4081             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4082         }
4083         return packFloat128( aSign, 0x7FFF, 0, 0 );
4084     }
4085     if ( aExp == 0 ) {
4086         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4087         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4088         --aExp;
4089     }
4090     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4091     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4092
4093 }
4094
4095
4096 /*----------------------------------------------------------------------------
4097 | Returns the remainder of the double-precision floating-point value `a'
4098 | with respect to the corresponding value `b'.  The operation is performed
4099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4100 *----------------------------------------------------------------------------*/
4101
4102 float64 float64_rem(float64 a, float64 b, float_status *status)
4103 {
4104     flag aSign, zSign;
4105     int aExp, bExp, expDiff;
4106     uint64_t aSig, bSig;
4107     uint64_t q, alternateASig;
4108     int64_t sigMean;
4109
4110     a = float64_squash_input_denormal(a, status);
4111     b = float64_squash_input_denormal(b, status);
4112     aSig = extractFloat64Frac( a );
4113     aExp = extractFloat64Exp( a );
4114     aSign = extractFloat64Sign( a );
4115     bSig = extractFloat64Frac( b );
4116     bExp = extractFloat64Exp( b );
4117     if ( aExp == 0x7FF ) {
4118         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4119             return propagateFloat64NaN(a, b, status);
4120         }
4121         float_raise(float_flag_invalid, status);
4122         return float64_default_nan(status);
4123     }
4124     if ( bExp == 0x7FF ) {
4125         if (bSig) {
4126             return propagateFloat64NaN(a, b, status);
4127         }
4128         return a;
4129     }
4130     if ( bExp == 0 ) {
4131         if ( bSig == 0 ) {
4132             float_raise(float_flag_invalid, status);
4133             return float64_default_nan(status);
4134         }
4135         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4136     }
4137     if ( aExp == 0 ) {
4138         if ( aSig == 0 ) return a;
4139         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4140     }
4141     expDiff = aExp - bExp;
4142     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4143     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4144     if ( expDiff < 0 ) {
4145         if ( expDiff < -1 ) return a;
4146         aSig >>= 1;
4147     }
4148     q = ( bSig <= aSig );
4149     if ( q ) aSig -= bSig;
4150     expDiff -= 64;
4151     while ( 0 < expDiff ) {
4152         q = estimateDiv128To64( aSig, 0, bSig );
4153         q = ( 2 < q ) ? q - 2 : 0;
4154         aSig = - ( ( bSig>>2 ) * q );
4155         expDiff -= 62;
4156     }
4157     expDiff += 64;
4158     if ( 0 < expDiff ) {
4159         q = estimateDiv128To64( aSig, 0, bSig );
4160         q = ( 2 < q ) ? q - 2 : 0;
4161         q >>= 64 - expDiff;
4162         bSig >>= 2;
4163         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4164     }
4165     else {
4166         aSig >>= 2;
4167         bSig >>= 2;
4168     }
4169     do {
4170         alternateASig = aSig;
4171         ++q;
4172         aSig -= bSig;
4173     } while ( 0 <= (int64_t) aSig );
4174     sigMean = aSig + alternateASig;
4175     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4176         aSig = alternateASig;
4177     }
4178     zSign = ( (int64_t) aSig < 0 );
4179     if ( zSign ) aSig = - aSig;
4180     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4181
4182 }
4183
4184 /*----------------------------------------------------------------------------
4185 | Returns the binary log of the double-precision floating-point value `a'.
4186 | The operation is performed according to the IEC/IEEE Standard for Binary
4187 | Floating-Point Arithmetic.
4188 *----------------------------------------------------------------------------*/
4189 float64 float64_log2(float64 a, float_status *status)
4190 {
4191     flag aSign, zSign;
4192     int aExp;
4193     uint64_t aSig, aSig0, aSig1, zSig, i;
4194     a = float64_squash_input_denormal(a, status);
4195
4196     aSig = extractFloat64Frac( a );
4197     aExp = extractFloat64Exp( a );
4198     aSign = extractFloat64Sign( a );
4199
4200     if ( aExp == 0 ) {
4201         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4202         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4203     }
4204     if ( aSign ) {
4205         float_raise(float_flag_invalid, status);
4206         return float64_default_nan(status);
4207     }
4208     if ( aExp == 0x7FF ) {
4209         if (aSig) {
4210             return propagateFloat64NaN(a, float64_zero, status);
4211         }
4212         return a;
4213     }
4214
4215     aExp -= 0x3FF;
4216     aSig |= LIT64( 0x0010000000000000 );
4217     zSign = aExp < 0;
4218     zSig = (uint64_t)aExp << 52;
4219     for (i = 1LL << 51; i > 0; i >>= 1) {
4220         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4221         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4222         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4223             aSig >>= 1;
4224             zSig |= i;
4225         }
4226     }
4227
4228     if ( zSign )
4229         zSig = -zSig;
4230     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4231 }
4232
4233 /*----------------------------------------------------------------------------
4234 | Returns 1 if the double-precision floating-point value `a' is equal to the
4235 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4236 | if either operand is a NaN.  Otherwise, the comparison is performed
4237 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4238 *----------------------------------------------------------------------------*/
4239
4240 int float64_eq(float64 a, float64 b, float_status *status)
4241 {
4242     uint64_t av, bv;
4243     a = float64_squash_input_denormal(a, status);
4244     b = float64_squash_input_denormal(b, status);
4245
4246     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4247          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4248        ) {
4249         float_raise(float_flag_invalid, status);
4250         return 0;
4251     }
4252     av = float64_val(a);
4253     bv = float64_val(b);
4254     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4255
4256 }
4257
4258 /*----------------------------------------------------------------------------
4259 | Returns 1 if the double-precision floating-point value `a' is less than or
4260 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4261 | exception is raised if either operand is a NaN.  The comparison is performed
4262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4263 *----------------------------------------------------------------------------*/
4264
4265 int float64_le(float64 a, float64 b, float_status *status)
4266 {
4267     flag aSign, bSign;
4268     uint64_t av, bv;
4269     a = float64_squash_input_denormal(a, status);
4270     b = float64_squash_input_denormal(b, status);
4271
4272     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4273          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4274        ) {
4275         float_raise(float_flag_invalid, status);
4276         return 0;
4277     }
4278     aSign = extractFloat64Sign( a );
4279     bSign = extractFloat64Sign( b );
4280     av = float64_val(a);
4281     bv = float64_val(b);
4282     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4283     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4284
4285 }
4286
4287 /*----------------------------------------------------------------------------
4288 | Returns 1 if the double-precision floating-point value `a' is less than
4289 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4290 | raised if either operand is a NaN.  The comparison is performed according
4291 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4293
4294 int float64_lt(float64 a, float64 b, float_status *status)
4295 {
4296     flag aSign, bSign;
4297     uint64_t av, bv;
4298
4299     a = float64_squash_input_denormal(a, status);
4300     b = float64_squash_input_denormal(b, status);
4301     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4302          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4303        ) {
4304         float_raise(float_flag_invalid, status);
4305         return 0;
4306     }
4307     aSign = extractFloat64Sign( a );
4308     bSign = extractFloat64Sign( b );
4309     av = float64_val(a);
4310     bv = float64_val(b);
4311     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4312     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4313
4314 }
4315
4316 /*----------------------------------------------------------------------------
4317 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4318 | be compared, and 0 otherwise.  The invalid exception is raised if either
4319 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4320 | Standard for Binary Floating-Point Arithmetic.
4321 *----------------------------------------------------------------------------*/
4322
4323 int float64_unordered(float64 a, float64 b, float_status *status)
4324 {
4325     a = float64_squash_input_denormal(a, status);
4326     b = float64_squash_input_denormal(b, status);
4327
4328     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4329          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4330        ) {
4331         float_raise(float_flag_invalid, status);
4332         return 1;
4333     }
4334     return 0;
4335 }
4336
4337 /*----------------------------------------------------------------------------
4338 | Returns 1 if the double-precision floating-point value `a' is equal to the
4339 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4340 | exception.The comparison is performed according to the IEC/IEEE Standard
4341 | for Binary Floating-Point Arithmetic.
4342 *----------------------------------------------------------------------------*/
4343
4344 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4345 {
4346     uint64_t av, bv;
4347     a = float64_squash_input_denormal(a, status);
4348     b = float64_squash_input_denormal(b, status);
4349
4350     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4351          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4352        ) {
4353         if (float64_is_signaling_nan(a, status)
4354          || float64_is_signaling_nan(b, status)) {
4355             float_raise(float_flag_invalid, status);
4356         }
4357         return 0;
4358     }
4359     av = float64_val(a);
4360     bv = float64_val(b);
4361     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4362
4363 }
4364
4365 /*----------------------------------------------------------------------------
4366 | Returns 1 if the double-precision floating-point value `a' is less than or
4367 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4368 | cause an exception.  Otherwise, the comparison is performed according to the
4369 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4370 *----------------------------------------------------------------------------*/
4371
4372 int float64_le_quiet(float64 a, float64 b, float_status *status)
4373 {
4374     flag aSign, bSign;
4375     uint64_t av, bv;
4376     a = float64_squash_input_denormal(a, status);
4377     b = float64_squash_input_denormal(b, status);
4378
4379     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4380          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4381        ) {
4382         if (float64_is_signaling_nan(a, status)
4383          || float64_is_signaling_nan(b, status)) {
4384             float_raise(float_flag_invalid, status);
4385         }
4386         return 0;
4387     }
4388     aSign = extractFloat64Sign( a );
4389     bSign = extractFloat64Sign( b );
4390     av = float64_val(a);
4391     bv = float64_val(b);
4392     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4393     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4394
4395 }
4396
4397 /*----------------------------------------------------------------------------
4398 | Returns 1 if the double-precision floating-point value `a' is less than
4399 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4400 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4401 | Standard for Binary Floating-Point Arithmetic.
4402 *----------------------------------------------------------------------------*/
4403
4404 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4405 {
4406     flag aSign, bSign;
4407     uint64_t av, bv;
4408     a = float64_squash_input_denormal(a, status);
4409     b = float64_squash_input_denormal(b, status);
4410
4411     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4412          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4413        ) {
4414         if (float64_is_signaling_nan(a, status)
4415          || float64_is_signaling_nan(b, status)) {
4416             float_raise(float_flag_invalid, status);
4417         }
4418         return 0;
4419     }
4420     aSign = extractFloat64Sign( a );
4421     bSign = extractFloat64Sign( b );
4422     av = float64_val(a);
4423     bv = float64_val(b);
4424     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4425     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4426
4427 }
4428
4429 /*----------------------------------------------------------------------------
4430 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4431 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4432 | comparison is performed according to the IEC/IEEE Standard for Binary
4433 | Floating-Point Arithmetic.
4434 *----------------------------------------------------------------------------*/
4435
4436 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4437 {
4438     a = float64_squash_input_denormal(a, status);
4439     b = float64_squash_input_denormal(b, status);
4440
4441     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4442          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4443        ) {
4444         if (float64_is_signaling_nan(a, status)
4445          || float64_is_signaling_nan(b, status)) {
4446             float_raise(float_flag_invalid, status);
4447         }
4448         return 1;
4449     }
4450     return 0;
4451 }
4452
4453 /*----------------------------------------------------------------------------
4454 | Returns the result of converting the extended double-precision floating-
4455 | point value `a' to the 32-bit two's complement integer format.  The
4456 | conversion is performed according to the IEC/IEEE Standard for Binary
4457 | Floating-Point Arithmetic---which means in particular that the conversion
4458 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4459 | largest positive integer is returned.  Otherwise, if the conversion
4460 | overflows, the largest integer with the same sign as `a' is returned.
4461 *----------------------------------------------------------------------------*/
4462
4463 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4464 {
4465     flag aSign;
4466     int32_t aExp, shiftCount;
4467     uint64_t aSig;
4468
4469     if (floatx80_invalid_encoding(a)) {
4470         float_raise(float_flag_invalid, status);
4471         return 1 << 31;
4472     }
4473     aSig = extractFloatx80Frac( a );
4474     aExp = extractFloatx80Exp( a );
4475     aSign = extractFloatx80Sign( a );
4476     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4477     shiftCount = 0x4037 - aExp;
4478     if ( shiftCount <= 0 ) shiftCount = 1;
4479     shift64RightJamming( aSig, shiftCount, &aSig );
4480     return roundAndPackInt32(aSign, aSig, status);
4481
4482 }
4483
4484 /*----------------------------------------------------------------------------
4485 | Returns the result of converting the extended double-precision floating-
4486 | point value `a' to the 32-bit two's complement integer format.  The
4487 | conversion is performed according to the IEC/IEEE Standard for Binary
4488 | Floating-Point Arithmetic, except that the conversion is always rounded
4489 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4490 | Otherwise, if the conversion overflows, the largest integer with the same
4491 | sign as `a' is returned.
4492 *----------------------------------------------------------------------------*/
4493
4494 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4495 {
4496     flag aSign;
4497     int32_t aExp, shiftCount;
4498     uint64_t aSig, savedASig;
4499     int32_t z;
4500
4501     if (floatx80_invalid_encoding(a)) {
4502         float_raise(float_flag_invalid, status);
4503         return 1 << 31;
4504     }
4505     aSig = extractFloatx80Frac( a );
4506     aExp = extractFloatx80Exp( a );
4507     aSign = extractFloatx80Sign( a );
4508     if ( 0x401E < aExp ) {
4509         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4510         goto invalid;
4511     }
4512     else if ( aExp < 0x3FFF ) {
4513         if (aExp || aSig) {
4514             status->float_exception_flags |= float_flag_inexact;
4515         }
4516         return 0;
4517     }
4518     shiftCount = 0x403E - aExp;
4519     savedASig = aSig;
4520     aSig >>= shiftCount;
4521     z = aSig;
4522     if ( aSign ) z = - z;
4523     if ( ( z < 0 ) ^ aSign ) {
4524  invalid:
4525         float_raise(float_flag_invalid, status);
4526         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4527     }
4528     if ( ( aSig<<shiftCount ) != savedASig ) {
4529         status->float_exception_flags |= float_flag_inexact;
4530     }
4531     return z;
4532
4533 }
4534
4535 /*----------------------------------------------------------------------------
4536 | Returns the result of converting the extended double-precision floating-
4537 | point value `a' to the 64-bit two's complement integer format.  The
4538 | conversion is performed according to the IEC/IEEE Standard for Binary
4539 | Floating-Point Arithmetic---which means in particular that the conversion
4540 | is rounded according to the current rounding mode.  If `a' is a NaN,
4541 | the largest positive integer is returned.  Otherwise, if the conversion
4542 | overflows, the largest integer with the same sign as `a' is returned.
4543 *----------------------------------------------------------------------------*/
4544
4545 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4546 {
4547     flag aSign;
4548     int32_t aExp, shiftCount;
4549     uint64_t aSig, aSigExtra;
4550
4551     if (floatx80_invalid_encoding(a)) {
4552         float_raise(float_flag_invalid, status);
4553         return 1ULL << 63;
4554     }
4555     aSig = extractFloatx80Frac( a );
4556     aExp = extractFloatx80Exp( a );
4557     aSign = extractFloatx80Sign( a );
4558     shiftCount = 0x403E - aExp;
4559     if ( shiftCount <= 0 ) {
4560         if ( shiftCount ) {
4561             float_raise(float_flag_invalid, status);
4562             if (!aSign || floatx80_is_any_nan(a)) {
4563                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4564             }
4565             return (int64_t) LIT64( 0x8000000000000000 );
4566         }
4567         aSigExtra = 0;
4568     }
4569     else {
4570         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4571     }
4572     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4573
4574 }
4575
4576 /*----------------------------------------------------------------------------
4577 | Returns the result of converting the extended double-precision floating-
4578 | point value `a' to the 64-bit two's complement integer format.  The
4579 | conversion is performed according to the IEC/IEEE Standard for Binary
4580 | Floating-Point Arithmetic, except that the conversion is always rounded
4581 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4582 | Otherwise, if the conversion overflows, the largest integer with the same
4583 | sign as `a' is returned.
4584 *----------------------------------------------------------------------------*/
4585
4586 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4587 {
4588     flag aSign;
4589     int32_t aExp, shiftCount;
4590     uint64_t aSig;
4591     int64_t z;
4592
4593     if (floatx80_invalid_encoding(a)) {
4594         float_raise(float_flag_invalid, status);
4595         return 1ULL << 63;
4596     }
4597     aSig = extractFloatx80Frac( a );
4598     aExp = extractFloatx80Exp( a );
4599     aSign = extractFloatx80Sign( a );
4600     shiftCount = aExp - 0x403E;
4601     if ( 0 <= shiftCount ) {
4602         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4603         if ( ( a.high != 0xC03E ) || aSig ) {
4604             float_raise(float_flag_invalid, status);
4605             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4606                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4607             }
4608         }
4609         return (int64_t) LIT64( 0x8000000000000000 );
4610     }
4611     else if ( aExp < 0x3FFF ) {
4612         if (aExp | aSig) {
4613             status->float_exception_flags |= float_flag_inexact;
4614         }
4615         return 0;
4616     }
4617     z = aSig>>( - shiftCount );
4618     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4619         status->float_exception_flags |= float_flag_inexact;
4620     }
4621     if ( aSign ) z = - z;
4622     return z;
4623
4624 }
4625
4626 /*----------------------------------------------------------------------------
4627 | Returns the result of converting the extended double-precision floating-
4628 | point value `a' to the single-precision floating-point format.  The
4629 | conversion is performed according to the IEC/IEEE Standard for Binary
4630 | Floating-Point Arithmetic.
4631 *----------------------------------------------------------------------------*/
4632
4633 float32 floatx80_to_float32(floatx80 a, float_status *status)
4634 {
4635     flag aSign;
4636     int32_t aExp;
4637     uint64_t aSig;
4638
4639     if (floatx80_invalid_encoding(a)) {
4640         float_raise(float_flag_invalid, status);
4641         return float32_default_nan(status);
4642     }
4643     aSig = extractFloatx80Frac( a );
4644     aExp = extractFloatx80Exp( a );
4645     aSign = extractFloatx80Sign( a );
4646     if ( aExp == 0x7FFF ) {
4647         if ( (uint64_t) ( aSig<<1 ) ) {
4648             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4649         }
4650         return packFloat32( aSign, 0xFF, 0 );
4651     }
4652     shift64RightJamming( aSig, 33, &aSig );
4653     if ( aExp || aSig ) aExp -= 0x3F81;
4654     return roundAndPackFloat32(aSign, aExp, aSig, status);
4655
4656 }
4657
4658 /*----------------------------------------------------------------------------
4659 | Returns the result of converting the extended double-precision floating-
4660 | point value `a' to the double-precision floating-point format.  The
4661 | conversion is performed according to the IEC/IEEE Standard for Binary
4662 | Floating-Point Arithmetic.
4663 *----------------------------------------------------------------------------*/
4664
4665 float64 floatx80_to_float64(floatx80 a, float_status *status)
4666 {
4667     flag aSign;
4668     int32_t aExp;
4669     uint64_t aSig, zSig;
4670
4671     if (floatx80_invalid_encoding(a)) {
4672         float_raise(float_flag_invalid, status);
4673         return float64_default_nan(status);
4674     }
4675     aSig = extractFloatx80Frac( a );
4676     aExp = extractFloatx80Exp( a );
4677     aSign = extractFloatx80Sign( a );
4678     if ( aExp == 0x7FFF ) {
4679         if ( (uint64_t) ( aSig<<1 ) ) {
4680             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4681         }
4682         return packFloat64( aSign, 0x7FF, 0 );
4683     }
4684     shift64RightJamming( aSig, 1, &zSig );
4685     if ( aExp || aSig ) aExp -= 0x3C01;
4686     return roundAndPackFloat64(aSign, aExp, zSig, status);
4687
4688 }
4689
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the quadruple-precision floating-point format.  The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4696
4697 float128 floatx80_to_float128(floatx80 a, float_status *status)
4698 {
4699     flag aSign;
4700     int aExp;
4701     uint64_t aSig, zSig0, zSig1;
4702
4703     if (floatx80_invalid_encoding(a)) {
4704         float_raise(float_flag_invalid, status);
4705         return float128_default_nan(status);
4706     }
4707     aSig = extractFloatx80Frac( a );
4708     aExp = extractFloatx80Exp( a );
4709     aSign = extractFloatx80Sign( a );
4710     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4711         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4712     }
4713     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4714     return packFloat128( aSign, aExp, zSig0, zSig1 );
4715
4716 }
4717
4718 /*----------------------------------------------------------------------------
4719 | Rounds the extended double-precision floating-point value `a'
4720 | to the precision provided by floatx80_rounding_precision and returns the
4721 | result as an extended double-precision floating-point value.
4722 | The operation is performed according to the IEC/IEEE Standard for Binary
4723 | Floating-Point Arithmetic.
4724 *----------------------------------------------------------------------------*/
4725
4726 floatx80 floatx80_round(floatx80 a, float_status *status)
4727 {
4728     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4729                                 extractFloatx80Sign(a),
4730                                 extractFloatx80Exp(a),
4731                                 extractFloatx80Frac(a), 0, status);
4732 }
4733
4734 /*----------------------------------------------------------------------------
4735 | Rounds the extended double-precision floating-point value `a' to an integer,
4736 | and returns the result as an extended quadruple-precision floating-point
4737 | value.  The operation is performed according to the IEC/IEEE Standard for
4738 | Binary Floating-Point Arithmetic.
4739 *----------------------------------------------------------------------------*/
4740
4741 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4742 {
4743     flag aSign;
4744     int32_t aExp;
4745     uint64_t lastBitMask, roundBitsMask;
4746     floatx80 z;
4747
4748     if (floatx80_invalid_encoding(a)) {
4749         float_raise(float_flag_invalid, status);
4750         return floatx80_default_nan(status);
4751     }
4752     aExp = extractFloatx80Exp( a );
4753     if ( 0x403E <= aExp ) {
4754         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4755             return propagateFloatx80NaN(a, a, status);
4756         }
4757         return a;
4758     }
4759     if ( aExp < 0x3FFF ) {
4760         if (    ( aExp == 0 )
4761              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4762             return a;
4763         }
4764         status->float_exception_flags |= float_flag_inexact;
4765         aSign = extractFloatx80Sign( a );
4766         switch (status->float_rounding_mode) {
4767          case float_round_nearest_even:
4768             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4769                ) {
4770                 return
4771                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4772             }
4773             break;
4774         case float_round_ties_away:
4775             if (aExp == 0x3FFE) {
4776                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4777             }
4778             break;
4779          case float_round_down:
4780             return
4781                   aSign ?
4782                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4783                 : packFloatx80( 0, 0, 0 );
4784          case float_round_up:
4785             return
4786                   aSign ? packFloatx80( 1, 0, 0 )
4787                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4788         }
4789         return packFloatx80( aSign, 0, 0 );
4790     }
4791     lastBitMask = 1;
4792     lastBitMask <<= 0x403E - aExp;
4793     roundBitsMask = lastBitMask - 1;
4794     z = a;
4795     switch (status->float_rounding_mode) {
4796     case float_round_nearest_even:
4797         z.low += lastBitMask>>1;
4798         if ((z.low & roundBitsMask) == 0) {
4799             z.low &= ~lastBitMask;
4800         }
4801         break;
4802     case float_round_ties_away:
4803         z.low += lastBitMask >> 1;
4804         break;
4805     case float_round_to_zero:
4806         break;
4807     case float_round_up:
4808         if (!extractFloatx80Sign(z)) {
4809             z.low += roundBitsMask;
4810         }
4811         break;
4812     case float_round_down:
4813         if (extractFloatx80Sign(z)) {
4814             z.low += roundBitsMask;
4815         }
4816         break;
4817     default:
4818         abort();
4819     }
4820     z.low &= ~ roundBitsMask;
4821     if ( z.low == 0 ) {
4822         ++z.high;
4823         z.low = LIT64( 0x8000000000000000 );
4824     }
4825     if (z.low != a.low) {
4826         status->float_exception_flags |= float_flag_inexact;
4827     }
4828     return z;
4829
4830 }
4831
4832 /*----------------------------------------------------------------------------
4833 | Returns the result of adding the absolute values of the extended double-
4834 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4835 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4836 | The addition is performed according to the IEC/IEEE Standard for Binary
4837 | Floating-Point Arithmetic.
4838 *----------------------------------------------------------------------------*/
4839
4840 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4841                                 float_status *status)
4842 {
4843     int32_t aExp, bExp, zExp;
4844     uint64_t aSig, bSig, zSig0, zSig1;
4845     int32_t expDiff;
4846
4847     aSig = extractFloatx80Frac( a );
4848     aExp = extractFloatx80Exp( a );
4849     bSig = extractFloatx80Frac( b );
4850     bExp = extractFloatx80Exp( b );
4851     expDiff = aExp - bExp;
4852     if ( 0 < expDiff ) {
4853         if ( aExp == 0x7FFF ) {
4854             if ((uint64_t)(aSig << 1)) {
4855                 return propagateFloatx80NaN(a, b, status);
4856             }
4857             return a;
4858         }
4859         if ( bExp == 0 ) --expDiff;
4860         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4861         zExp = aExp;
4862     }
4863     else if ( expDiff < 0 ) {
4864         if ( bExp == 0x7FFF ) {
4865             if ((uint64_t)(bSig << 1)) {
4866                 return propagateFloatx80NaN(a, b, status);
4867             }
4868             return packFloatx80(zSign,
4869                                 floatx80_infinity_high,
4870                                 floatx80_infinity_low);
4871         }
4872         if ( aExp == 0 ) ++expDiff;
4873         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4874         zExp = bExp;
4875     }
4876     else {
4877         if ( aExp == 0x7FFF ) {
4878             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4879                 return propagateFloatx80NaN(a, b, status);
4880             }
4881             return a;
4882         }
4883         zSig1 = 0;
4884         zSig0 = aSig + bSig;
4885         if ( aExp == 0 ) {
4886             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4887             goto roundAndPack;
4888         }
4889         zExp = aExp;
4890         goto shiftRight1;
4891     }
4892     zSig0 = aSig + bSig;
4893     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4894  shiftRight1:
4895     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4896     zSig0 |= LIT64( 0x8000000000000000 );
4897     ++zExp;
4898  roundAndPack:
4899     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4900                                 zSign, zExp, zSig0, zSig1, status);
4901 }
4902
4903 /*----------------------------------------------------------------------------
4904 | Returns the result of subtracting the absolute values of the extended
4905 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4906 | difference is negated before being returned.  `zSign' is ignored if the
4907 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4908 | Standard for Binary Floating-Point Arithmetic.
4909 *----------------------------------------------------------------------------*/
4910
4911 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4912                                 float_status *status)
4913 {
4914     int32_t aExp, bExp, zExp;
4915     uint64_t aSig, bSig, zSig0, zSig1;
4916     int32_t expDiff;
4917
4918     aSig = extractFloatx80Frac( a );
4919     aExp = extractFloatx80Exp( a );
4920     bSig = extractFloatx80Frac( b );
4921     bExp = extractFloatx80Exp( b );
4922     expDiff = aExp - bExp;
4923     if ( 0 < expDiff ) goto aExpBigger;
4924     if ( expDiff < 0 ) goto bExpBigger;
4925     if ( aExp == 0x7FFF ) {
4926         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4927             return propagateFloatx80NaN(a, b, status);
4928         }
4929         float_raise(float_flag_invalid, status);
4930         return floatx80_default_nan(status);
4931     }
4932     if ( aExp == 0 ) {
4933         aExp = 1;
4934         bExp = 1;
4935     }
4936     zSig1 = 0;
4937     if ( bSig < aSig ) goto aBigger;
4938     if ( aSig < bSig ) goto bBigger;
4939     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4940  bExpBigger:
4941     if ( bExp == 0x7FFF ) {
4942         if ((uint64_t)(bSig << 1)) {
4943             return propagateFloatx80NaN(a, b, status);
4944         }
4945         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4946                             floatx80_infinity_low);
4947     }
4948     if ( aExp == 0 ) ++expDiff;
4949     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4950  bBigger:
4951     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4952     zExp = bExp;
4953     zSign ^= 1;
4954     goto normalizeRoundAndPack;
4955  aExpBigger:
4956     if ( aExp == 0x7FFF ) {
4957         if ((uint64_t)(aSig << 1)) {
4958             return propagateFloatx80NaN(a, b, status);
4959         }
4960         return a;
4961     }
4962     if ( bExp == 0 ) --expDiff;
4963     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4964  aBigger:
4965     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4966     zExp = aExp;
4967  normalizeRoundAndPack:
4968     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4969                                          zSign, zExp, zSig0, zSig1, status);
4970 }
4971
4972 /*----------------------------------------------------------------------------
4973 | Returns the result of adding the extended double-precision floating-point
4974 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4975 | Standard for Binary Floating-Point Arithmetic.
4976 *----------------------------------------------------------------------------*/
4977
4978 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4979 {
4980     flag aSign, bSign;
4981
4982     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4983         float_raise(float_flag_invalid, status);
4984         return floatx80_default_nan(status);
4985     }
4986     aSign = extractFloatx80Sign( a );
4987     bSign = extractFloatx80Sign( b );
4988     if ( aSign == bSign ) {
4989         return addFloatx80Sigs(a, b, aSign, status);
4990     }
4991     else {
4992         return subFloatx80Sigs(a, b, aSign, status);
4993     }
4994
4995 }
4996
4997 /*----------------------------------------------------------------------------
4998 | Returns the result of subtracting the extended double-precision floating-
4999 | point values `a' and `b'.  The operation is performed according to the
5000 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5001 *----------------------------------------------------------------------------*/
5002
5003 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5004 {
5005     flag aSign, bSign;
5006
5007     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5008         float_raise(float_flag_invalid, status);
5009         return floatx80_default_nan(status);
5010     }
5011     aSign = extractFloatx80Sign( a );
5012     bSign = extractFloatx80Sign( b );
5013     if ( aSign == bSign ) {
5014         return subFloatx80Sigs(a, b, aSign, status);
5015     }
5016     else {
5017         return addFloatx80Sigs(a, b, aSign, status);
5018     }
5019
5020 }
5021
5022 /*----------------------------------------------------------------------------
5023 | Returns the result of multiplying the extended double-precision floating-
5024 | point values `a' and `b'.  The operation is performed according to the
5025 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5026 *----------------------------------------------------------------------------*/
5027
5028 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5029 {
5030     flag aSign, bSign, zSign;
5031     int32_t aExp, bExp, zExp;
5032     uint64_t aSig, bSig, zSig0, zSig1;
5033
5034     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5035         float_raise(float_flag_invalid, status);
5036         return floatx80_default_nan(status);
5037     }
5038     aSig = extractFloatx80Frac( a );
5039     aExp = extractFloatx80Exp( a );
5040     aSign = extractFloatx80Sign( a );
5041     bSig = extractFloatx80Frac( b );
5042     bExp = extractFloatx80Exp( b );
5043     bSign = extractFloatx80Sign( b );
5044     zSign = aSign ^ bSign;
5045     if ( aExp == 0x7FFF ) {
5046         if (    (uint64_t) ( aSig<<1 )
5047              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5048             return propagateFloatx80NaN(a, b, status);
5049         }
5050         if ( ( bExp | bSig ) == 0 ) goto invalid;
5051         return packFloatx80(zSign, floatx80_infinity_high,
5052                                    floatx80_infinity_low);
5053     }
5054     if ( bExp == 0x7FFF ) {
5055         if ((uint64_t)(bSig << 1)) {
5056             return propagateFloatx80NaN(a, b, status);
5057         }
5058         if ( ( aExp | aSig ) == 0 ) {
5059  invalid:
5060             float_raise(float_flag_invalid, status);
5061             return floatx80_default_nan(status);
5062         }
5063         return packFloatx80(zSign, floatx80_infinity_high,
5064                                    floatx80_infinity_low);
5065     }
5066     if ( aExp == 0 ) {
5067         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5068         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5069     }
5070     if ( bExp == 0 ) {
5071         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5072         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5073     }
5074     zExp = aExp + bExp - 0x3FFE;
5075     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5076     if ( 0 < (int64_t) zSig0 ) {
5077         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5078         --zExp;
5079     }
5080     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5081                                 zSign, zExp, zSig0, zSig1, status);
5082 }
5083
5084 /*----------------------------------------------------------------------------
5085 | Returns the result of dividing the extended double-precision floating-point
5086 | value `a' by the corresponding value `b'.  The operation is performed
5087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5088 *----------------------------------------------------------------------------*/
5089
5090 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5091 {
5092     flag aSign, bSign, zSign;
5093     int32_t aExp, bExp, zExp;
5094     uint64_t aSig, bSig, zSig0, zSig1;
5095     uint64_t rem0, rem1, rem2, term0, term1, term2;
5096
5097     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5098         float_raise(float_flag_invalid, status);
5099         return floatx80_default_nan(status);
5100     }
5101     aSig = extractFloatx80Frac( a );
5102     aExp = extractFloatx80Exp( a );
5103     aSign = extractFloatx80Sign( a );
5104     bSig = extractFloatx80Frac( b );
5105     bExp = extractFloatx80Exp( b );
5106     bSign = extractFloatx80Sign( b );
5107     zSign = aSign ^ bSign;
5108     if ( aExp == 0x7FFF ) {
5109         if ((uint64_t)(aSig << 1)) {
5110             return propagateFloatx80NaN(a, b, status);
5111         }
5112         if ( bExp == 0x7FFF ) {
5113             if ((uint64_t)(bSig << 1)) {
5114                 return propagateFloatx80NaN(a, b, status);
5115             }
5116             goto invalid;
5117         }
5118         return packFloatx80(zSign, floatx80_infinity_high,
5119                                    floatx80_infinity_low);
5120     }
5121     if ( bExp == 0x7FFF ) {
5122         if ((uint64_t)(bSig << 1)) {
5123             return propagateFloatx80NaN(a, b, status);
5124         }
5125         return packFloatx80( zSign, 0, 0 );
5126     }
5127     if ( bExp == 0 ) {
5128         if ( bSig == 0 ) {
5129             if ( ( aExp | aSig ) == 0 ) {
5130  invalid:
5131                 float_raise(float_flag_invalid, status);
5132                 return floatx80_default_nan(status);
5133             }
5134             float_raise(float_flag_divbyzero, status);
5135             return packFloatx80(zSign, floatx80_infinity_high,
5136                                        floatx80_infinity_low);
5137         }
5138         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5139     }
5140     if ( aExp == 0 ) {
5141         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5142         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5143     }
5144     zExp = aExp - bExp + 0x3FFE;
5145     rem1 = 0;
5146     if ( bSig <= aSig ) {
5147         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5148         ++zExp;
5149     }
5150     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5151     mul64To128( bSig, zSig0, &term0, &term1 );
5152     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5153     while ( (int64_t) rem0 < 0 ) {
5154         --zSig0;
5155         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5156     }
5157     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5158     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5159         mul64To128( bSig, zSig1, &term1, &term2 );
5160         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5161         while ( (int64_t) rem1 < 0 ) {
5162             --zSig1;
5163             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5164         }
5165         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5166     }
5167     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5168                                 zSign, zExp, zSig0, zSig1, status);
5169 }
5170
5171 /*----------------------------------------------------------------------------
5172 | Returns the remainder of the extended double-precision floating-point value
5173 | `a' with respect to the corresponding value `b'.  The operation is performed
5174 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5175 *----------------------------------------------------------------------------*/
5176
5177 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5178 {
5179     flag aSign, zSign;
5180     int32_t aExp, bExp, expDiff;
5181     uint64_t aSig0, aSig1, bSig;
5182     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5183
5184     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5185         float_raise(float_flag_invalid, status);
5186         return floatx80_default_nan(status);
5187     }
5188     aSig0 = extractFloatx80Frac( a );
5189     aExp = extractFloatx80Exp( a );
5190     aSign = extractFloatx80Sign( a );
5191     bSig = extractFloatx80Frac( b );
5192     bExp = extractFloatx80Exp( b );
5193     if ( aExp == 0x7FFF ) {
5194         if (    (uint64_t) ( aSig0<<1 )
5195              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5196             return propagateFloatx80NaN(a, b, status);
5197         }
5198         goto invalid;
5199     }
5200     if ( bExp == 0x7FFF ) {
5201         if ((uint64_t)(bSig << 1)) {
5202             return propagateFloatx80NaN(a, b, status);
5203         }
5204         return a;
5205     }
5206     if ( bExp == 0 ) {
5207         if ( bSig == 0 ) {
5208  invalid:
5209             float_raise(float_flag_invalid, status);
5210             return floatx80_default_nan(status);
5211         }
5212         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5213     }
5214     if ( aExp == 0 ) {
5215         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5216         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5217     }
5218     bSig |= LIT64( 0x8000000000000000 );
5219     zSign = aSign;
5220     expDiff = aExp - bExp;
5221     aSig1 = 0;
5222     if ( expDiff < 0 ) {
5223         if ( expDiff < -1 ) return a;
5224         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5225         expDiff = 0;
5226     }
5227     q = ( bSig <= aSig0 );
5228     if ( q ) aSig0 -= bSig;
5229     expDiff -= 64;
5230     while ( 0 < expDiff ) {
5231         q = estimateDiv128To64( aSig0, aSig1, bSig );
5232         q = ( 2 < q ) ? q - 2 : 0;
5233         mul64To128( bSig, q, &term0, &term1 );
5234         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5235         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5236         expDiff -= 62;
5237     }
5238     expDiff += 64;
5239     if ( 0 < expDiff ) {
5240         q = estimateDiv128To64( aSig0, aSig1, bSig );
5241         q = ( 2 < q ) ? q - 2 : 0;
5242         q >>= 64 - expDiff;
5243         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5244         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5246         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5247             ++q;
5248             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5249         }
5250     }
5251     else {
5252         term1 = 0;
5253         term0 = bSig;
5254     }
5255     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5256     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5257          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5258               && ( q & 1 ) )
5259        ) {
5260         aSig0 = alternateASig0;
5261         aSig1 = alternateASig1;
5262         zSign = ! zSign;
5263     }
5264     return
5265         normalizeRoundAndPackFloatx80(
5266             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5267
5268 }
5269
5270 /*----------------------------------------------------------------------------
5271 | Returns the square root of the extended double-precision floating-point
5272 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5273 | for Binary Floating-Point Arithmetic.
5274 *----------------------------------------------------------------------------*/
5275
5276 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5277 {
5278     flag aSign;
5279     int32_t aExp, zExp;
5280     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5281     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5282
5283     if (floatx80_invalid_encoding(a)) {
5284         float_raise(float_flag_invalid, status);
5285         return floatx80_default_nan(status);
5286     }
5287     aSig0 = extractFloatx80Frac( a );
5288     aExp = extractFloatx80Exp( a );
5289     aSign = extractFloatx80Sign( a );
5290     if ( aExp == 0x7FFF ) {
5291         if ((uint64_t)(aSig0 << 1)) {
5292             return propagateFloatx80NaN(a, a, status);
5293         }
5294         if ( ! aSign ) return a;
5295         goto invalid;
5296     }
5297     if ( aSign ) {
5298         if ( ( aExp | aSig0 ) == 0 ) return a;
5299  invalid:
5300         float_raise(float_flag_invalid, status);
5301         return floatx80_default_nan(status);
5302     }
5303     if ( aExp == 0 ) {
5304         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5305         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5306     }
5307     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5308     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5309     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5310     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5311     doubleZSig0 = zSig0<<1;
5312     mul64To128( zSig0, zSig0, &term0, &term1 );
5313     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5314     while ( (int64_t) rem0 < 0 ) {
5315         --zSig0;
5316         doubleZSig0 -= 2;
5317         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5318     }
5319     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5320     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5321         if ( zSig1 == 0 ) zSig1 = 1;
5322         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5323         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5324         mul64To128( zSig1, zSig1, &term2, &term3 );
5325         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5326         while ( (int64_t) rem1 < 0 ) {
5327             --zSig1;
5328             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5329             term3 |= 1;
5330             term2 |= doubleZSig0;
5331             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5332         }
5333         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5334     }
5335     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5336     zSig0 |= doubleZSig0;
5337     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5338                                 0, zExp, zSig0, zSig1, status);
5339 }
5340
5341 /*----------------------------------------------------------------------------
5342 | Returns 1 if the extended double-precision floating-point value `a' is equal
5343 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5344 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5346 *----------------------------------------------------------------------------*/
5347
5348 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5349 {
5350
5351     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5352         || (extractFloatx80Exp(a) == 0x7FFF
5353             && (uint64_t) (extractFloatx80Frac(a) << 1))
5354         || (extractFloatx80Exp(b) == 0x7FFF
5355             && (uint64_t) (extractFloatx80Frac(b) << 1))
5356        ) {
5357         float_raise(float_flag_invalid, status);
5358         return 0;
5359     }
5360     return
5361            ( a.low == b.low )
5362         && (    ( a.high == b.high )
5363              || (    ( a.low == 0 )
5364                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5365            );
5366
5367 }
5368
5369 /*----------------------------------------------------------------------------
5370 | Returns 1 if the extended double-precision floating-point value `a' is
5371 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5372 | invalid exception is raised if either operand is a NaN.  The comparison is
5373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5374 | Arithmetic.
5375 *----------------------------------------------------------------------------*/
5376
5377 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5378 {
5379     flag aSign, bSign;
5380
5381     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5382         || (extractFloatx80Exp(a) == 0x7FFF
5383             && (uint64_t) (extractFloatx80Frac(a) << 1))
5384         || (extractFloatx80Exp(b) == 0x7FFF
5385             && (uint64_t) (extractFloatx80Frac(b) << 1))
5386        ) {
5387         float_raise(float_flag_invalid, status);
5388         return 0;
5389     }
5390     aSign = extractFloatx80Sign( a );
5391     bSign = extractFloatx80Sign( b );
5392     if ( aSign != bSign ) {
5393         return
5394                aSign
5395             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5396                  == 0 );
5397     }
5398     return
5399           aSign ? le128( b.high, b.low, a.high, a.low )
5400         : le128( a.high, a.low, b.high, b.low );
5401
5402 }
5403
5404 /*----------------------------------------------------------------------------
5405 | Returns 1 if the extended double-precision floating-point value `a' is
5406 | less than the corresponding value `b', and 0 otherwise.  The invalid
5407 | exception is raised if either operand is a NaN.  The comparison is performed
5408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5409 *----------------------------------------------------------------------------*/
5410
5411 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5412 {
5413     flag aSign, bSign;
5414
5415     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5416         || (extractFloatx80Exp(a) == 0x7FFF
5417             && (uint64_t) (extractFloatx80Frac(a) << 1))
5418         || (extractFloatx80Exp(b) == 0x7FFF
5419             && (uint64_t) (extractFloatx80Frac(b) << 1))
5420        ) {
5421         float_raise(float_flag_invalid, status);
5422         return 0;
5423     }
5424     aSign = extractFloatx80Sign( a );
5425     bSign = extractFloatx80Sign( b );
5426     if ( aSign != bSign ) {
5427         return
5428                aSign
5429             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5430                  != 0 );
5431     }
5432     return
5433           aSign ? lt128( b.high, b.low, a.high, a.low )
5434         : lt128( a.high, a.low, b.high, b.low );
5435
5436 }
5437
5438 /*----------------------------------------------------------------------------
5439 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5440 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5441 | either operand is a NaN.   The comparison is performed according to the
5442 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5443 *----------------------------------------------------------------------------*/
5444 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5445 {
5446     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5447         || (extractFloatx80Exp(a) == 0x7FFF
5448             && (uint64_t) (extractFloatx80Frac(a) << 1))
5449         || (extractFloatx80Exp(b) == 0x7FFF
5450             && (uint64_t) (extractFloatx80Frac(b) << 1))
5451        ) {
5452         float_raise(float_flag_invalid, status);
5453         return 1;
5454     }
5455     return 0;
5456 }
5457
5458 /*----------------------------------------------------------------------------
5459 | Returns 1 if the extended double-precision floating-point value `a' is
5460 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5461 | cause an exception.  The comparison is performed according to the IEC/IEEE
5462 | Standard for Binary Floating-Point Arithmetic.
5463 *----------------------------------------------------------------------------*/
5464
5465 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5466 {
5467
5468     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5469         float_raise(float_flag_invalid, status);
5470         return 0;
5471     }
5472     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5473               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5474          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5475               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5476        ) {
5477         if (floatx80_is_signaling_nan(a, status)
5478          || floatx80_is_signaling_nan(b, status)) {
5479             float_raise(float_flag_invalid, status);
5480         }
5481         return 0;
5482     }
5483     return
5484            ( a.low == b.low )
5485         && (    ( a.high == b.high )
5486              || (    ( a.low == 0 )
5487                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5488            );
5489
5490 }
5491
5492 /*----------------------------------------------------------------------------
5493 | Returns 1 if the extended double-precision floating-point value `a' is less
5494 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5495 | do not cause an exception.  Otherwise, the comparison is performed according
5496 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5498
5499 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5500 {
5501     flag aSign, bSign;
5502
5503     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5504         float_raise(float_flag_invalid, status);
5505         return 0;
5506     }
5507     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5508               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5509          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5510               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5511        ) {
5512         if (floatx80_is_signaling_nan(a, status)
5513          || floatx80_is_signaling_nan(b, status)) {
5514             float_raise(float_flag_invalid, status);
5515         }
5516         return 0;
5517     }
5518     aSign = extractFloatx80Sign( a );
5519     bSign = extractFloatx80Sign( b );
5520     if ( aSign != bSign ) {
5521         return
5522                aSign
5523             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5524                  == 0 );
5525     }
5526     return
5527           aSign ? le128( b.high, b.low, a.high, a.low )
5528         : le128( a.high, a.low, b.high, b.low );
5529
5530 }
5531
5532 /*----------------------------------------------------------------------------
5533 | Returns 1 if the extended double-precision floating-point value `a' is less
5534 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5535 | an exception.  Otherwise, the comparison is performed according to the
5536 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5537 *----------------------------------------------------------------------------*/
5538
5539 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5540 {
5541     flag aSign, bSign;
5542
5543     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5544         float_raise(float_flag_invalid, status);
5545         return 0;
5546     }
5547     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5548               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5549          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5550               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5551        ) {
5552         if (floatx80_is_signaling_nan(a, status)
5553          || floatx80_is_signaling_nan(b, status)) {
5554             float_raise(float_flag_invalid, status);
5555         }
5556         return 0;
5557     }
5558     aSign = extractFloatx80Sign( a );
5559     bSign = extractFloatx80Sign( b );
5560     if ( aSign != bSign ) {
5561         return
5562                aSign
5563             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5564                  != 0 );
5565     }
5566     return
5567           aSign ? lt128( b.high, b.low, a.high, a.low )
5568         : lt128( a.high, a.low, b.high, b.low );
5569
5570 }
5571
5572 /*----------------------------------------------------------------------------
5573 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5574 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5575 | The comparison is performed according to the IEC/IEEE Standard for Binary
5576 | Floating-Point Arithmetic.
5577 *----------------------------------------------------------------------------*/
5578 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5579 {
5580     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5581         float_raise(float_flag_invalid, status);
5582         return 1;
5583     }
5584     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5585               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5586          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5587               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5588        ) {
5589         if (floatx80_is_signaling_nan(a, status)
5590          || floatx80_is_signaling_nan(b, status)) {
5591             float_raise(float_flag_invalid, status);
5592         }
5593         return 1;
5594     }
5595     return 0;
5596 }
5597
5598 /*----------------------------------------------------------------------------
5599 | Returns the result of converting the quadruple-precision floating-point
5600 | value `a' to the 32-bit two's complement integer format.  The conversion
5601 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5602 | Arithmetic---which means in particular that the conversion is rounded
5603 | according to the current rounding mode.  If `a' is a NaN, the largest
5604 | positive integer is returned.  Otherwise, if the conversion overflows, the
5605 | largest integer with the same sign as `a' is returned.
5606 *----------------------------------------------------------------------------*/
5607
5608 int32_t float128_to_int32(float128 a, float_status *status)
5609 {
5610     flag aSign;
5611     int32_t aExp, shiftCount;
5612     uint64_t aSig0, aSig1;
5613
5614     aSig1 = extractFloat128Frac1( a );
5615     aSig0 = extractFloat128Frac0( a );
5616     aExp = extractFloat128Exp( a );
5617     aSign = extractFloat128Sign( a );
5618     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5619     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5620     aSig0 |= ( aSig1 != 0 );
5621     shiftCount = 0x4028 - aExp;
5622     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5623     return roundAndPackInt32(aSign, aSig0, status);
5624
5625 }
5626
5627 /*----------------------------------------------------------------------------
5628 | Returns the result of converting the quadruple-precision floating-point
5629 | value `a' to the 32-bit two's complement integer format.  The conversion
5630 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5631 | Arithmetic, except that the conversion is always rounded toward zero.  If
5632 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5633 | conversion overflows, the largest integer with the same sign as `a' is
5634 | returned.
5635 *----------------------------------------------------------------------------*/
5636
5637 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5638 {
5639     flag aSign;
5640     int32_t aExp, shiftCount;
5641     uint64_t aSig0, aSig1, savedASig;
5642     int32_t z;
5643
5644     aSig1 = extractFloat128Frac1( a );
5645     aSig0 = extractFloat128Frac0( a );
5646     aExp = extractFloat128Exp( a );
5647     aSign = extractFloat128Sign( a );
5648     aSig0 |= ( aSig1 != 0 );
5649     if ( 0x401E < aExp ) {
5650         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5651         goto invalid;
5652     }
5653     else if ( aExp < 0x3FFF ) {
5654         if (aExp || aSig0) {
5655             status->float_exception_flags |= float_flag_inexact;
5656         }
5657         return 0;
5658     }
5659     aSig0 |= LIT64( 0x0001000000000000 );
5660     shiftCount = 0x402F - aExp;
5661     savedASig = aSig0;
5662     aSig0 >>= shiftCount;
5663     z = aSig0;
5664     if ( aSign ) z = - z;
5665     if ( ( z < 0 ) ^ aSign ) {
5666  invalid:
5667         float_raise(float_flag_invalid, status);
5668         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5669     }
5670     if ( ( aSig0<<shiftCount ) != savedASig ) {
5671         status->float_exception_flags |= float_flag_inexact;
5672     }
5673     return z;
5674
5675 }
5676
5677 /*----------------------------------------------------------------------------
5678 | Returns the result of converting the quadruple-precision floating-point
5679 | value `a' to the 64-bit two's complement integer format.  The conversion
5680 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5681 | Arithmetic---which means in particular that the conversion is rounded
5682 | according to the current rounding mode.  If `a' is a NaN, the largest
5683 | positive integer is returned.  Otherwise, if the conversion overflows, the
5684 | largest integer with the same sign as `a' is returned.
5685 *----------------------------------------------------------------------------*/
5686
5687 int64_t float128_to_int64(float128 a, float_status *status)
5688 {
5689     flag aSign;
5690     int32_t aExp, shiftCount;
5691     uint64_t aSig0, aSig1;
5692
5693     aSig1 = extractFloat128Frac1( a );
5694     aSig0 = extractFloat128Frac0( a );
5695     aExp = extractFloat128Exp( a );
5696     aSign = extractFloat128Sign( a );
5697     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5698     shiftCount = 0x402F - aExp;
5699     if ( shiftCount <= 0 ) {
5700         if ( 0x403E < aExp ) {
5701             float_raise(float_flag_invalid, status);
5702             if (    ! aSign
5703                  || (    ( aExp == 0x7FFF )
5704                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5705                     )
5706                ) {
5707                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5708             }
5709             return (int64_t) LIT64( 0x8000000000000000 );
5710         }
5711         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5712     }
5713     else {
5714         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5715     }
5716     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5717
5718 }
5719
5720 /*----------------------------------------------------------------------------
5721 | Returns the result of converting the quadruple-precision floating-point
5722 | value `a' to the 64-bit two's complement integer format.  The conversion
5723 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5724 | Arithmetic, except that the conversion is always rounded toward zero.
5725 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5726 | the conversion overflows, the largest integer with the same sign as `a' is
5727 | returned.
5728 *----------------------------------------------------------------------------*/
5729
5730 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5731 {
5732     flag aSign;
5733     int32_t aExp, shiftCount;
5734     uint64_t aSig0, aSig1;
5735     int64_t z;
5736
5737     aSig1 = extractFloat128Frac1( a );
5738     aSig0 = extractFloat128Frac0( a );
5739     aExp = extractFloat128Exp( a );
5740     aSign = extractFloat128Sign( a );
5741     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5742     shiftCount = aExp - 0x402F;
5743     if ( 0 < shiftCount ) {
5744         if ( 0x403E <= aExp ) {
5745             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5746             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5747                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5748                 if (aSig1) {
5749                     status->float_exception_flags |= float_flag_inexact;
5750                 }
5751             }
5752             else {
5753                 float_raise(float_flag_invalid, status);
5754                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5755                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5756                 }
5757             }
5758             return (int64_t) LIT64( 0x8000000000000000 );
5759         }
5760         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5761         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5762             status->float_exception_flags |= float_flag_inexact;
5763         }
5764     }
5765     else {
5766         if ( aExp < 0x3FFF ) {
5767             if ( aExp | aSig0 | aSig1 ) {
5768                 status->float_exception_flags |= float_flag_inexact;
5769             }
5770             return 0;
5771         }
5772         z = aSig0>>( - shiftCount );
5773         if (    aSig1
5774              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5775             status->float_exception_flags |= float_flag_inexact;
5776         }
5777     }
5778     if ( aSign ) z = - z;
5779     return z;
5780
5781 }
5782
5783 /*----------------------------------------------------------------------------
5784 | Returns the result of converting the quadruple-precision floating-point value
5785 | `a' to the 64-bit unsigned integer format.  The conversion is
5786 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5787 | Arithmetic---which means in particular that the conversion is rounded
5788 | according to the current rounding mode.  If `a' is a NaN, the largest
5789 | positive integer is returned.  If the conversion overflows, the
5790 | largest unsigned integer is returned.  If 'a' is negative, the value is
5791 | rounded and zero is returned; negative values that do not round to zero
5792 | will raise the inexact exception.
5793 *----------------------------------------------------------------------------*/
5794
5795 uint64_t float128_to_uint64(float128 a, float_status *status)
5796 {
5797     flag aSign;
5798     int aExp;
5799     int shiftCount;
5800     uint64_t aSig0, aSig1;
5801
5802     aSig0 = extractFloat128Frac0(a);
5803     aSig1 = extractFloat128Frac1(a);
5804     aExp = extractFloat128Exp(a);
5805     aSign = extractFloat128Sign(a);
5806     if (aSign && (aExp > 0x3FFE)) {
5807         float_raise(float_flag_invalid, status);
5808         if (float128_is_any_nan(a)) {
5809             return LIT64(0xFFFFFFFFFFFFFFFF);
5810         } else {
5811             return 0;
5812         }
5813     }
5814     if (aExp) {
5815         aSig0 |= LIT64(0x0001000000000000);
5816     }
5817     shiftCount = 0x402F - aExp;
5818     if (shiftCount <= 0) {
5819         if (0x403E < aExp) {
5820             float_raise(float_flag_invalid, status);
5821             return LIT64(0xFFFFFFFFFFFFFFFF);
5822         }
5823         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5824     } else {
5825         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5826     }
5827     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5828 }
5829
5830 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5831 {
5832     uint64_t v;
5833     signed char current_rounding_mode = status->float_rounding_mode;
5834
5835     set_float_rounding_mode(float_round_to_zero, status);
5836     v = float128_to_uint64(a, status);
5837     set_float_rounding_mode(current_rounding_mode, status);
5838
5839     return v;
5840 }
5841
5842 /*----------------------------------------------------------------------------
5843 | Returns the result of converting the quadruple-precision floating-point
5844 | value `a' to the 32-bit unsigned integer format.  The conversion
5845 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5846 | Arithmetic except that the conversion is always rounded toward zero.
5847 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5848 | if the conversion overflows, the largest unsigned integer is returned.
5849 | If 'a' is negative, the value is rounded and zero is returned; negative
5850 | values that do not round to zero will raise the inexact exception.
5851 *----------------------------------------------------------------------------*/
5852
5853 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5854 {
5855     uint64_t v;
5856     uint32_t res;
5857     int old_exc_flags = get_float_exception_flags(status);
5858
5859     v = float128_to_uint64_round_to_zero(a, status);
5860     if (v > 0xffffffff) {
5861         res = 0xffffffff;
5862     } else {
5863         return v;
5864     }
5865     set_float_exception_flags(old_exc_flags, status);
5866     float_raise(float_flag_invalid, status);
5867     return res;
5868 }
5869
5870 /*----------------------------------------------------------------------------
5871 | Returns the result of converting the quadruple-precision floating-point
5872 | value `a' to the single-precision floating-point format.  The conversion
5873 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5874 | Arithmetic.
5875 *----------------------------------------------------------------------------*/
5876
5877 float32 float128_to_float32(float128 a, float_status *status)
5878 {
5879     flag aSign;
5880     int32_t aExp;
5881     uint64_t aSig0, aSig1;
5882     uint32_t zSig;
5883
5884     aSig1 = extractFloat128Frac1( a );
5885     aSig0 = extractFloat128Frac0( a );
5886     aExp = extractFloat128Exp( a );
5887     aSign = extractFloat128Sign( a );
5888     if ( aExp == 0x7FFF ) {
5889         if ( aSig0 | aSig1 ) {
5890             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5891         }
5892         return packFloat32( aSign, 0xFF, 0 );
5893     }
5894     aSig0 |= ( aSig1 != 0 );
5895     shift64RightJamming( aSig0, 18, &aSig0 );
5896     zSig = aSig0;
5897     if ( aExp || zSig ) {
5898         zSig |= 0x40000000;
5899         aExp -= 0x3F81;
5900     }
5901     return roundAndPackFloat32(aSign, aExp, zSig, status);
5902
5903 }
5904
5905 /*----------------------------------------------------------------------------
5906 | Returns the result of converting the quadruple-precision floating-point
5907 | value `a' to the double-precision floating-point format.  The conversion
5908 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5909 | Arithmetic.
5910 *----------------------------------------------------------------------------*/
5911
5912 float64 float128_to_float64(float128 a, float_status *status)
5913 {
5914     flag aSign;
5915     int32_t aExp;
5916     uint64_t aSig0, aSig1;
5917
5918     aSig1 = extractFloat128Frac1( a );
5919     aSig0 = extractFloat128Frac0( a );
5920     aExp = extractFloat128Exp( a );
5921     aSign = extractFloat128Sign( a );
5922     if ( aExp == 0x7FFF ) {
5923         if ( aSig0 | aSig1 ) {
5924             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5925         }
5926         return packFloat64( aSign, 0x7FF, 0 );
5927     }
5928     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5929     aSig0 |= ( aSig1 != 0 );
5930     if ( aExp || aSig0 ) {
5931         aSig0 |= LIT64( 0x4000000000000000 );
5932         aExp -= 0x3C01;
5933     }
5934     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5935
5936 }
5937
5938 /*----------------------------------------------------------------------------
5939 | Returns the result of converting the quadruple-precision floating-point
5940 | value `a' to the extended double-precision floating-point format.  The
5941 | conversion is performed according to the IEC/IEEE Standard for Binary
5942 | Floating-Point Arithmetic.
5943 *----------------------------------------------------------------------------*/
5944
5945 floatx80 float128_to_floatx80(float128 a, float_status *status)
5946 {
5947     flag aSign;
5948     int32_t aExp;
5949     uint64_t aSig0, aSig1;
5950
5951     aSig1 = extractFloat128Frac1( a );
5952     aSig0 = extractFloat128Frac0( a );
5953     aExp = extractFloat128Exp( a );
5954     aSign = extractFloat128Sign( a );
5955     if ( aExp == 0x7FFF ) {
5956         if ( aSig0 | aSig1 ) {
5957             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5958         }
5959         return packFloatx80(aSign, floatx80_infinity_high,
5960                                    floatx80_infinity_low);
5961     }
5962     if ( aExp == 0 ) {
5963         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5964         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5965     }
5966     else {
5967         aSig0 |= LIT64( 0x0001000000000000 );
5968     }
5969     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5970     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5971
5972 }
5973
5974 /*----------------------------------------------------------------------------
5975 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5976 | returns the result as a quadruple-precision floating-point value.  The
5977 | operation is performed according to the IEC/IEEE Standard for Binary
5978 | Floating-Point Arithmetic.
5979 *----------------------------------------------------------------------------*/
5980
5981 float128 float128_round_to_int(float128 a, float_status *status)
5982 {
5983     flag aSign;
5984     int32_t aExp;
5985     uint64_t lastBitMask, roundBitsMask;
5986     float128 z;
5987
5988     aExp = extractFloat128Exp( a );
5989     if ( 0x402F <= aExp ) {
5990         if ( 0x406F <= aExp ) {
5991             if (    ( aExp == 0x7FFF )
5992                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5993                ) {
5994                 return propagateFloat128NaN(a, a, status);
5995             }
5996             return a;
5997         }
5998         lastBitMask = 1;
5999         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6000         roundBitsMask = lastBitMask - 1;
6001         z = a;
6002         switch (status->float_rounding_mode) {
6003         case float_round_nearest_even:
6004             if ( lastBitMask ) {
6005                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6006                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6007             }
6008             else {
6009                 if ( (int64_t) z.low < 0 ) {
6010                     ++z.high;
6011                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6012                 }
6013             }
6014             break;
6015         case float_round_ties_away:
6016             if (lastBitMask) {
6017                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6018             } else {
6019                 if ((int64_t) z.low < 0) {
6020                     ++z.high;
6021                 }
6022             }
6023             break;
6024         case float_round_to_zero:
6025             break;
6026         case float_round_up:
6027             if (!extractFloat128Sign(z)) {
6028                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6029             }
6030             break;
6031         case float_round_down:
6032             if (extractFloat128Sign(z)) {
6033                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6034             }
6035             break;
6036         default:
6037             abort();
6038         }
6039         z.low &= ~ roundBitsMask;
6040     }
6041     else {
6042         if ( aExp < 0x3FFF ) {
6043             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6044             status->float_exception_flags |= float_flag_inexact;
6045             aSign = extractFloat128Sign( a );
6046             switch (status->float_rounding_mode) {
6047              case float_round_nearest_even:
6048                 if (    ( aExp == 0x3FFE )
6049                      && (   extractFloat128Frac0( a )
6050                           | extractFloat128Frac1( a ) )
6051                    ) {
6052                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6053                 }
6054                 break;
6055             case float_round_ties_away:
6056                 if (aExp == 0x3FFE) {
6057                     return packFloat128(aSign, 0x3FFF, 0, 0);
6058                 }
6059                 break;
6060              case float_round_down:
6061                 return
6062                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6063                     : packFloat128( 0, 0, 0, 0 );
6064              case float_round_up:
6065                 return
6066                       aSign ? packFloat128( 1, 0, 0, 0 )
6067                     : packFloat128( 0, 0x3FFF, 0, 0 );
6068             }
6069             return packFloat128( aSign, 0, 0, 0 );
6070         }
6071         lastBitMask = 1;
6072         lastBitMask <<= 0x402F - aExp;
6073         roundBitsMask = lastBitMask - 1;
6074         z.low = 0;
6075         z.high = a.high;
6076         switch (status->float_rounding_mode) {
6077         case float_round_nearest_even:
6078             z.high += lastBitMask>>1;
6079             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6080                 z.high &= ~ lastBitMask;
6081             }
6082             break;
6083         case float_round_ties_away:
6084             z.high += lastBitMask>>1;
6085             break;
6086         case float_round_to_zero:
6087             break;
6088         case float_round_up:
6089             if (!extractFloat128Sign(z)) {
6090                 z.high |= ( a.low != 0 );
6091                 z.high += roundBitsMask;
6092             }
6093             break;
6094         case float_round_down:
6095             if (extractFloat128Sign(z)) {
6096                 z.high |= (a.low != 0);
6097                 z.high += roundBitsMask;
6098             }
6099             break;
6100         default:
6101             abort();
6102         }
6103         z.high &= ~ roundBitsMask;
6104     }
6105     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6106         status->float_exception_flags |= float_flag_inexact;
6107     }
6108     return z;
6109
6110 }
6111
6112 /*----------------------------------------------------------------------------
6113 | Returns the result of adding the absolute values of the quadruple-precision
6114 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6115 | before being returned.  `zSign' is ignored if the result is a NaN.
6116 | The addition is performed according to the IEC/IEEE Standard for Binary
6117 | Floating-Point Arithmetic.
6118 *----------------------------------------------------------------------------*/
6119
6120 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6121                                 float_status *status)
6122 {
6123     int32_t aExp, bExp, zExp;
6124     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6125     int32_t expDiff;
6126
6127     aSig1 = extractFloat128Frac1( a );
6128     aSig0 = extractFloat128Frac0( a );
6129     aExp = extractFloat128Exp( a );
6130     bSig1 = extractFloat128Frac1( b );
6131     bSig0 = extractFloat128Frac0( b );
6132     bExp = extractFloat128Exp( b );
6133     expDiff = aExp - bExp;
6134     if ( 0 < expDiff ) {
6135         if ( aExp == 0x7FFF ) {
6136             if (aSig0 | aSig1) {
6137                 return propagateFloat128NaN(a, b, status);
6138             }
6139             return a;
6140         }
6141         if ( bExp == 0 ) {
6142             --expDiff;
6143         }
6144         else {
6145             bSig0 |= LIT64( 0x0001000000000000 );
6146         }
6147         shift128ExtraRightJamming(
6148             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6149         zExp = aExp;
6150     }
6151     else if ( expDiff < 0 ) {
6152         if ( bExp == 0x7FFF ) {
6153             if (bSig0 | bSig1) {
6154                 return propagateFloat128NaN(a, b, status);
6155             }
6156             return packFloat128( zSign, 0x7FFF, 0, 0 );
6157         }
6158         if ( aExp == 0 ) {
6159             ++expDiff;
6160         }
6161         else {
6162             aSig0 |= LIT64( 0x0001000000000000 );
6163         }
6164         shift128ExtraRightJamming(
6165             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6166         zExp = bExp;
6167     }
6168     else {
6169         if ( aExp == 0x7FFF ) {
6170             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6171                 return propagateFloat128NaN(a, b, status);
6172             }
6173             return a;
6174         }
6175         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6176         if ( aExp == 0 ) {
6177             if (status->flush_to_zero) {
6178                 if (zSig0 | zSig1) {
6179                     float_raise(float_flag_output_denormal, status);
6180                 }
6181                 return packFloat128(zSign, 0, 0, 0);
6182             }
6183             return packFloat128( zSign, 0, zSig0, zSig1 );
6184         }
6185         zSig2 = 0;
6186         zSig0 |= LIT64( 0x0002000000000000 );
6187         zExp = aExp;
6188         goto shiftRight1;
6189     }
6190     aSig0 |= LIT64( 0x0001000000000000 );
6191     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6192     --zExp;
6193     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6194     ++zExp;
6195  shiftRight1:
6196     shift128ExtraRightJamming(
6197         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6198  roundAndPack:
6199     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6200
6201 }
6202
6203 /*----------------------------------------------------------------------------
6204 | Returns the result of subtracting the absolute values of the quadruple-
6205 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6206 | difference is negated before being returned.  `zSign' is ignored if the
6207 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6208 | Standard for Binary Floating-Point Arithmetic.
6209 *----------------------------------------------------------------------------*/
6210
6211 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6212                                 float_status *status)
6213 {
6214     int32_t aExp, bExp, zExp;
6215     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6216     int32_t expDiff;
6217
6218     aSig1 = extractFloat128Frac1( a );
6219     aSig0 = extractFloat128Frac0( a );
6220     aExp = extractFloat128Exp( a );
6221     bSig1 = extractFloat128Frac1( b );
6222     bSig0 = extractFloat128Frac0( b );
6223     bExp = extractFloat128Exp( b );
6224     expDiff = aExp - bExp;
6225     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6226     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6227     if ( 0 < expDiff ) goto aExpBigger;
6228     if ( expDiff < 0 ) goto bExpBigger;
6229     if ( aExp == 0x7FFF ) {
6230         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6231             return propagateFloat128NaN(a, b, status);
6232         }
6233         float_raise(float_flag_invalid, status);
6234         return float128_default_nan(status);
6235     }
6236     if ( aExp == 0 ) {
6237         aExp = 1;
6238         bExp = 1;
6239     }
6240     if ( bSig0 < aSig0 ) goto aBigger;
6241     if ( aSig0 < bSig0 ) goto bBigger;
6242     if ( bSig1 < aSig1 ) goto aBigger;
6243     if ( aSig1 < bSig1 ) goto bBigger;
6244     return packFloat128(status->float_rounding_mode == float_round_down,
6245                         0, 0, 0);
6246  bExpBigger:
6247     if ( bExp == 0x7FFF ) {
6248         if (bSig0 | bSig1) {
6249             return propagateFloat128NaN(a, b, status);
6250         }
6251         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6252     }
6253     if ( aExp == 0 ) {
6254         ++expDiff;
6255     }
6256     else {
6257         aSig0 |= LIT64( 0x4000000000000000 );
6258     }
6259     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6260     bSig0 |= LIT64( 0x4000000000000000 );
6261  bBigger:
6262     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6263     zExp = bExp;
6264     zSign ^= 1;
6265     goto normalizeRoundAndPack;
6266  aExpBigger:
6267     if ( aExp == 0x7FFF ) {
6268         if (aSig0 | aSig1) {
6269             return propagateFloat128NaN(a, b, status);
6270         }
6271         return a;
6272     }
6273     if ( bExp == 0 ) {
6274         --expDiff;
6275     }
6276     else {
6277         bSig0 |= LIT64( 0x4000000000000000 );
6278     }
6279     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6280     aSig0 |= LIT64( 0x4000000000000000 );
6281  aBigger:
6282     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6283     zExp = aExp;
6284  normalizeRoundAndPack:
6285     --zExp;
6286     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6287                                          status);
6288
6289 }
6290
6291 /*----------------------------------------------------------------------------
6292 | Returns the result of adding the quadruple-precision floating-point values
6293 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6294 | for Binary Floating-Point Arithmetic.
6295 *----------------------------------------------------------------------------*/
6296
6297 float128 float128_add(float128 a, float128 b, float_status *status)
6298 {
6299     flag aSign, bSign;
6300
6301     aSign = extractFloat128Sign( a );
6302     bSign = extractFloat128Sign( b );
6303     if ( aSign == bSign ) {
6304         return addFloat128Sigs(a, b, aSign, status);
6305     }
6306     else {
6307         return subFloat128Sigs(a, b, aSign, status);
6308     }
6309
6310 }
6311
6312 /*----------------------------------------------------------------------------
6313 | Returns the result of subtracting the quadruple-precision floating-point
6314 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6315 | Standard for Binary Floating-Point Arithmetic.
6316 *----------------------------------------------------------------------------*/
6317
6318 float128 float128_sub(float128 a, float128 b, float_status *status)
6319 {
6320     flag aSign, bSign;
6321
6322     aSign = extractFloat128Sign( a );
6323     bSign = extractFloat128Sign( b );
6324     if ( aSign == bSign ) {
6325         return subFloat128Sigs(a, b, aSign, status);
6326     }
6327     else {
6328         return addFloat128Sigs(a, b, aSign, status);
6329     }
6330
6331 }
6332
6333 /*----------------------------------------------------------------------------
6334 | Returns the result of multiplying the quadruple-precision floating-point
6335 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6336 | Standard for Binary Floating-Point Arithmetic.
6337 *----------------------------------------------------------------------------*/
6338
6339 float128 float128_mul(float128 a, float128 b, float_status *status)
6340 {
6341     flag aSign, bSign, zSign;
6342     int32_t aExp, bExp, zExp;
6343     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6344
6345     aSig1 = extractFloat128Frac1( a );
6346     aSig0 = extractFloat128Frac0( a );
6347     aExp = extractFloat128Exp( a );
6348     aSign = extractFloat128Sign( a );
6349     bSig1 = extractFloat128Frac1( b );
6350     bSig0 = extractFloat128Frac0( b );
6351     bExp = extractFloat128Exp( b );
6352     bSign = extractFloat128Sign( b );
6353     zSign = aSign ^ bSign;
6354     if ( aExp == 0x7FFF ) {
6355         if (    ( aSig0 | aSig1 )
6356              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6357             return propagateFloat128NaN(a, b, status);
6358         }
6359         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6360         return packFloat128( zSign, 0x7FFF, 0, 0 );
6361     }
6362     if ( bExp == 0x7FFF ) {
6363         if (bSig0 | bSig1) {
6364             return propagateFloat128NaN(a, b, status);
6365         }
6366         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6367  invalid:
6368             float_raise(float_flag_invalid, status);
6369             return float128_default_nan(status);
6370         }
6371         return packFloat128( zSign, 0x7FFF, 0, 0 );
6372     }
6373     if ( aExp == 0 ) {
6374         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6375         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6376     }
6377     if ( bExp == 0 ) {
6378         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6379         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6380     }
6381     zExp = aExp + bExp - 0x4000;
6382     aSig0 |= LIT64( 0x0001000000000000 );
6383     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6384     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6385     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6386     zSig2 |= ( zSig3 != 0 );
6387     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6388         shift128ExtraRightJamming(
6389             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6390         ++zExp;
6391     }
6392     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6393
6394 }
6395
6396 /*----------------------------------------------------------------------------
6397 | Returns the result of dividing the quadruple-precision floating-point value
6398 | `a' by the corresponding value `b'.  The operation is performed according to
6399 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6400 *----------------------------------------------------------------------------*/
6401
6402 float128 float128_div(float128 a, float128 b, float_status *status)
6403 {
6404     flag aSign, bSign, zSign;
6405     int32_t aExp, bExp, zExp;
6406     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6407     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6408
6409     aSig1 = extractFloat128Frac1( a );
6410     aSig0 = extractFloat128Frac0( a );
6411     aExp = extractFloat128Exp( a );
6412     aSign = extractFloat128Sign( a );
6413     bSig1 = extractFloat128Frac1( b );
6414     bSig0 = extractFloat128Frac0( b );
6415     bExp = extractFloat128Exp( b );
6416     bSign = extractFloat128Sign( b );
6417     zSign = aSign ^ bSign;
6418     if ( aExp == 0x7FFF ) {
6419         if (aSig0 | aSig1) {
6420             return propagateFloat128NaN(a, b, status);
6421         }
6422         if ( bExp == 0x7FFF ) {
6423             if (bSig0 | bSig1) {
6424                 return propagateFloat128NaN(a, b, status);
6425             }
6426             goto invalid;
6427         }
6428         return packFloat128( zSign, 0x7FFF, 0, 0 );
6429     }
6430     if ( bExp == 0x7FFF ) {
6431         if (bSig0 | bSig1) {
6432             return propagateFloat128NaN(a, b, status);
6433         }
6434         return packFloat128( zSign, 0, 0, 0 );
6435     }
6436     if ( bExp == 0 ) {
6437         if ( ( bSig0 | bSig1 ) == 0 ) {
6438             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6439  invalid:
6440                 float_raise(float_flag_invalid, status);
6441                 return float128_default_nan(status);
6442             }
6443             float_raise(float_flag_divbyzero, status);
6444             return packFloat128( zSign, 0x7FFF, 0, 0 );
6445         }
6446         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6447     }
6448     if ( aExp == 0 ) {
6449         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6450         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6451     }
6452     zExp = aExp - bExp + 0x3FFD;
6453     shortShift128Left(
6454         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6455     shortShift128Left(
6456         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6457     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6458         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6459         ++zExp;
6460     }
6461     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6462     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6463     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6464     while ( (int64_t) rem0 < 0 ) {
6465         --zSig0;
6466         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6467     }
6468     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6469     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6470         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6471         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6472         while ( (int64_t) rem1 < 0 ) {
6473             --zSig1;
6474             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6475         }
6476         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6477     }
6478     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6479     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6480
6481 }
6482
6483 /*----------------------------------------------------------------------------
6484 | Returns the remainder of the quadruple-precision floating-point value `a'
6485 | with respect to the corresponding value `b'.  The operation is performed
6486 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6487 *----------------------------------------------------------------------------*/
6488
6489 float128 float128_rem(float128 a, float128 b, float_status *status)
6490 {
6491     flag aSign, zSign;
6492     int32_t aExp, bExp, expDiff;
6493     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6494     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6495     int64_t sigMean0;
6496
6497     aSig1 = extractFloat128Frac1( a );
6498     aSig0 = extractFloat128Frac0( a );
6499     aExp = extractFloat128Exp( a );
6500     aSign = extractFloat128Sign( a );
6501     bSig1 = extractFloat128Frac1( b );
6502     bSig0 = extractFloat128Frac0( b );
6503     bExp = extractFloat128Exp( b );
6504     if ( aExp == 0x7FFF ) {
6505         if (    ( aSig0 | aSig1 )
6506              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6507             return propagateFloat128NaN(a, b, status);
6508         }
6509         goto invalid;
6510     }
6511     if ( bExp == 0x7FFF ) {
6512         if (bSig0 | bSig1) {
6513             return propagateFloat128NaN(a, b, status);
6514         }
6515         return a;
6516     }
6517     if ( bExp == 0 ) {
6518         if ( ( bSig0 | bSig1 ) == 0 ) {
6519  invalid:
6520             float_raise(float_flag_invalid, status);
6521             return float128_default_nan(status);
6522         }
6523         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6524     }
6525     if ( aExp == 0 ) {
6526         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6527         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6528     }
6529     expDiff = aExp - bExp;
6530     if ( expDiff < -1 ) return a;
6531     shortShift128Left(
6532         aSig0 | LIT64( 0x0001000000000000 ),
6533         aSig1,
6534         15 - ( expDiff < 0 ),
6535         &aSig0,
6536         &aSig1
6537     );
6538     shortShift128Left(
6539         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6540     q = le128( bSig0, bSig1, aSig0, aSig1 );
6541     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6542     expDiff -= 64;
6543     while ( 0 < expDiff ) {
6544         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6545         q = ( 4 < q ) ? q - 4 : 0;
6546         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6547         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6548         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6549         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6550         expDiff -= 61;
6551     }
6552     if ( -64 < expDiff ) {
6553         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6554         q = ( 4 < q ) ? q - 4 : 0;
6555         q >>= - expDiff;
6556         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6557         expDiff += 52;
6558         if ( expDiff < 0 ) {
6559             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6560         }
6561         else {
6562             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6563         }
6564         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6565         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6566     }
6567     else {
6568         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6569         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6570     }
6571     do {
6572         alternateASig0 = aSig0;
6573         alternateASig1 = aSig1;
6574         ++q;
6575         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6576     } while ( 0 <= (int64_t) aSig0 );
6577     add128(
6578         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6579     if (    ( sigMean0 < 0 )
6580          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6581         aSig0 = alternateASig0;
6582         aSig1 = alternateASig1;
6583     }
6584     zSign = ( (int64_t) aSig0 < 0 );
6585     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6586     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6587                                          status);
6588 }
6589
6590 /*----------------------------------------------------------------------------
6591 | Returns the square root of the quadruple-precision floating-point value `a'.
6592 | The operation is performed according to the IEC/IEEE Standard for Binary
6593 | Floating-Point Arithmetic.
6594 *----------------------------------------------------------------------------*/
6595
6596 float128 float128_sqrt(float128 a, float_status *status)
6597 {
6598     flag aSign;
6599     int32_t aExp, zExp;
6600     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6601     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6602
6603     aSig1 = extractFloat128Frac1( a );
6604     aSig0 = extractFloat128Frac0( a );
6605     aExp = extractFloat128Exp( a );
6606     aSign = extractFloat128Sign( a );
6607     if ( aExp == 0x7FFF ) {
6608         if (aSig0 | aSig1) {
6609             return propagateFloat128NaN(a, a, status);
6610         }
6611         if ( ! aSign ) return a;
6612         goto invalid;
6613     }
6614     if ( aSign ) {
6615         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6616  invalid:
6617         float_raise(float_flag_invalid, status);
6618         return float128_default_nan(status);
6619     }
6620     if ( aExp == 0 ) {
6621         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6622         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6623     }
6624     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6625     aSig0 |= LIT64( 0x0001000000000000 );
6626     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6627     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6628     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6629     doubleZSig0 = zSig0<<1;
6630     mul64To128( zSig0, zSig0, &term0, &term1 );
6631     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6632     while ( (int64_t) rem0 < 0 ) {
6633         --zSig0;
6634         doubleZSig0 -= 2;
6635         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6636     }
6637     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6638     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6639         if ( zSig1 == 0 ) zSig1 = 1;
6640         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6641         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6642         mul64To128( zSig1, zSig1, &term2, &term3 );
6643         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6644         while ( (int64_t) rem1 < 0 ) {
6645             --zSig1;
6646             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6647             term3 |= 1;
6648             term2 |= doubleZSig0;
6649             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6650         }
6651         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6652     }
6653     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6654     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6655
6656 }
6657
6658 /*----------------------------------------------------------------------------
6659 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6660 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6661 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6662 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6663 *----------------------------------------------------------------------------*/
6664
6665 int float128_eq(float128 a, float128 b, float_status *status)
6666 {
6667
6668     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6669               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6670          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6671               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6672        ) {
6673         float_raise(float_flag_invalid, status);
6674         return 0;
6675     }
6676     return
6677            ( a.low == b.low )
6678         && (    ( a.high == b.high )
6679              || (    ( a.low == 0 )
6680                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6681            );
6682
6683 }
6684
6685 /*----------------------------------------------------------------------------
6686 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6687 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6688 | exception is raised if either operand is a NaN.  The comparison is performed
6689 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6690 *----------------------------------------------------------------------------*/
6691
6692 int float128_le(float128 a, float128 b, float_status *status)
6693 {
6694     flag aSign, bSign;
6695
6696     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6697               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6698          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6699               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6700        ) {
6701         float_raise(float_flag_invalid, status);
6702         return 0;
6703     }
6704     aSign = extractFloat128Sign( a );
6705     bSign = extractFloat128Sign( b );
6706     if ( aSign != bSign ) {
6707         return
6708                aSign
6709             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6710                  == 0 );
6711     }
6712     return
6713           aSign ? le128( b.high, b.low, a.high, a.low )
6714         : le128( a.high, a.low, b.high, b.low );
6715
6716 }
6717
6718 /*----------------------------------------------------------------------------
6719 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6720 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6721 | raised if either operand is a NaN.  The comparison is performed according
6722 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6723 *----------------------------------------------------------------------------*/
6724
6725 int float128_lt(float128 a, float128 b, float_status *status)
6726 {
6727     flag aSign, bSign;
6728
6729     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6730               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6731          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6732               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6733        ) {
6734         float_raise(float_flag_invalid, status);
6735         return 0;
6736     }
6737     aSign = extractFloat128Sign( a );
6738     bSign = extractFloat128Sign( b );
6739     if ( aSign != bSign ) {
6740         return
6741                aSign
6742             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6743                  != 0 );
6744     }
6745     return
6746           aSign ? lt128( b.high, b.low, a.high, a.low )
6747         : lt128( a.high, a.low, b.high, b.low );
6748
6749 }
6750
6751 /*----------------------------------------------------------------------------
6752 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6753 | be compared, and 0 otherwise.  The invalid exception is raised if either
6754 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6755 | Standard for Binary Floating-Point Arithmetic.
6756 *----------------------------------------------------------------------------*/
6757
6758 int float128_unordered(float128 a, float128 b, float_status *status)
6759 {
6760     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6761               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6762          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6763               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6764        ) {
6765         float_raise(float_flag_invalid, status);
6766         return 1;
6767     }
6768     return 0;
6769 }
6770
6771 /*----------------------------------------------------------------------------
6772 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6773 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6774 | exception.  The comparison is performed according to the IEC/IEEE Standard
6775 | for Binary Floating-Point Arithmetic.
6776 *----------------------------------------------------------------------------*/
6777
6778 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6779 {
6780
6781     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6782               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6783          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6784               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6785        ) {
6786         if (float128_is_signaling_nan(a, status)
6787          || float128_is_signaling_nan(b, status)) {
6788             float_raise(float_flag_invalid, status);
6789         }
6790         return 0;
6791     }
6792     return
6793            ( a.low == b.low )
6794         && (    ( a.high == b.high )
6795              || (    ( a.low == 0 )
6796                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6797            );
6798
6799 }
6800
6801 /*----------------------------------------------------------------------------
6802 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6803 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6804 | cause an exception.  Otherwise, the comparison is performed according to the
6805 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6806 *----------------------------------------------------------------------------*/
6807
6808 int float128_le_quiet(float128 a, float128 b, float_status *status)
6809 {
6810     flag aSign, bSign;
6811
6812     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6813               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6814          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6815               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6816        ) {
6817         if (float128_is_signaling_nan(a, status)
6818          || float128_is_signaling_nan(b, status)) {
6819             float_raise(float_flag_invalid, status);
6820         }
6821         return 0;
6822     }
6823     aSign = extractFloat128Sign( a );
6824     bSign = extractFloat128Sign( b );
6825     if ( aSign != bSign ) {
6826         return
6827                aSign
6828             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6829                  == 0 );
6830     }
6831     return
6832           aSign ? le128( b.high, b.low, a.high, a.low )
6833         : le128( a.high, a.low, b.high, b.low );
6834
6835 }
6836
6837 /*----------------------------------------------------------------------------
6838 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6839 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6840 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6841 | Standard for Binary Floating-Point Arithmetic.
6842 *----------------------------------------------------------------------------*/
6843
6844 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6845 {
6846     flag aSign, bSign;
6847
6848     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6849               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6850          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6851               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6852        ) {
6853         if (float128_is_signaling_nan(a, status)
6854          || float128_is_signaling_nan(b, status)) {
6855             float_raise(float_flag_invalid, status);
6856         }
6857         return 0;
6858     }
6859     aSign = extractFloat128Sign( a );
6860     bSign = extractFloat128Sign( b );
6861     if ( aSign != bSign ) {
6862         return
6863                aSign
6864             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6865                  != 0 );
6866     }
6867     return
6868           aSign ? lt128( b.high, b.low, a.high, a.low )
6869         : lt128( a.high, a.low, b.high, b.low );
6870
6871 }
6872
6873 /*----------------------------------------------------------------------------
6874 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6875 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6876 | comparison is performed according to the IEC/IEEE Standard for Binary
6877 | Floating-Point Arithmetic.
6878 *----------------------------------------------------------------------------*/
6879
6880 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6881 {
6882     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6883               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6884          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6885               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6886        ) {
6887         if (float128_is_signaling_nan(a, status)
6888          || float128_is_signaling_nan(b, status)) {
6889             float_raise(float_flag_invalid, status);
6890         }
6891         return 1;
6892     }
6893     return 0;
6894 }
6895
6896 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6897                                             int is_quiet, float_status *status)
6898 {
6899     flag aSign, bSign;
6900
6901     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6902         float_raise(float_flag_invalid, status);
6903         return float_relation_unordered;
6904     }
6905     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6906           ( extractFloatx80Frac( a )<<1 ) ) ||
6907         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6908           ( extractFloatx80Frac( b )<<1 ) )) {
6909         if (!is_quiet ||
6910             floatx80_is_signaling_nan(a, status) ||
6911             floatx80_is_signaling_nan(b, status)) {
6912             float_raise(float_flag_invalid, status);
6913         }
6914         return float_relation_unordered;
6915     }
6916     aSign = extractFloatx80Sign( a );
6917     bSign = extractFloatx80Sign( b );
6918     if ( aSign != bSign ) {
6919
6920         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6921              ( ( a.low | b.low ) == 0 ) ) {
6922             /* zero case */
6923             return float_relation_equal;
6924         } else {
6925             return 1 - (2 * aSign);
6926         }
6927     } else {
6928         if (a.low == b.low && a.high == b.high) {
6929             return float_relation_equal;
6930         } else {
6931             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6932         }
6933     }
6934 }
6935
6936 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6937 {
6938     return floatx80_compare_internal(a, b, 0, status);
6939 }
6940
6941 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6942 {
6943     return floatx80_compare_internal(a, b, 1, status);
6944 }
6945
6946 static inline int float128_compare_internal(float128 a, float128 b,
6947                                             int is_quiet, float_status *status)
6948 {
6949     flag aSign, bSign;
6950
6951     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6952           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6953         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6954           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6955         if (!is_quiet ||
6956             float128_is_signaling_nan(a, status) ||
6957             float128_is_signaling_nan(b, status)) {
6958             float_raise(float_flag_invalid, status);
6959         }
6960         return float_relation_unordered;
6961     }
6962     aSign = extractFloat128Sign( a );
6963     bSign = extractFloat128Sign( b );
6964     if ( aSign != bSign ) {
6965         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6966             /* zero case */
6967             return float_relation_equal;
6968         } else {
6969             return 1 - (2 * aSign);
6970         }
6971     } else {
6972         if (a.low == b.low && a.high == b.high) {
6973             return float_relation_equal;
6974         } else {
6975             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6976         }
6977     }
6978 }
6979
6980 int float128_compare(float128 a, float128 b, float_status *status)
6981 {
6982     return float128_compare_internal(a, b, 0, status);
6983 }
6984
6985 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6986 {
6987     return float128_compare_internal(a, b, 1, status);
6988 }
6989
6990 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6991 {
6992     flag aSign;
6993     int32_t aExp;
6994     uint64_t aSig;
6995
6996     if (floatx80_invalid_encoding(a)) {
6997         float_raise(float_flag_invalid, status);
6998         return floatx80_default_nan(status);
6999     }
7000     aSig = extractFloatx80Frac( a );
7001     aExp = extractFloatx80Exp( a );
7002     aSign = extractFloatx80Sign( a );
7003
7004     if ( aExp == 0x7FFF ) {
7005         if ( aSig<<1 ) {
7006             return propagateFloatx80NaN(a, a, status);
7007         }
7008         return a;
7009     }
7010
7011     if (aExp == 0) {
7012         if (aSig == 0) {
7013             return a;
7014         }
7015         aExp++;
7016     }
7017
7018     if (n > 0x10000) {
7019         n = 0x10000;
7020     } else if (n < -0x10000) {
7021         n = -0x10000;
7022     }
7023
7024     aExp += n;
7025     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7026                                          aSign, aExp, aSig, 0, status);
7027 }
7028
7029 float128 float128_scalbn(float128 a, int n, float_status *status)
7030 {
7031     flag aSign;
7032     int32_t aExp;
7033     uint64_t aSig0, aSig1;
7034
7035     aSig1 = extractFloat128Frac1( a );
7036     aSig0 = extractFloat128Frac0( a );
7037     aExp = extractFloat128Exp( a );
7038     aSign = extractFloat128Sign( a );
7039     if ( aExp == 0x7FFF ) {
7040         if ( aSig0 | aSig1 ) {
7041             return propagateFloat128NaN(a, a, status);
7042         }
7043         return a;
7044     }
7045     if (aExp != 0) {
7046         aSig0 |= LIT64( 0x0001000000000000 );
7047     } else if (aSig0 == 0 && aSig1 == 0) {
7048         return a;
7049     } else {
7050         aExp++;
7051     }
7052
7053     if (n > 0x10000) {
7054         n = 0x10000;
7055     } else if (n < -0x10000) {
7056         n = -0x10000;
7057     }
7058
7059     aExp += n - 1;
7060     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7061                                          , status);
7062
7063 }