fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Functions and definitions to determine:  (1) whether tininess for underflow
 100 | is detected before or after rounding by default, (2) what (if anything)
 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 | are propagated from function inputs to output.  These details are target-
 104 | specific.
 105 *----------------------------------------------------------------------------*/
 106 #include "softfloat-specialize.h"
 107
 108 /*----------------------------------------------------------------------------
 109 | Returns the fraction bits of the half-precision floating-point value `a'.
 110 *----------------------------------------------------------------------------*/
 111
 112 static inline uint32_t extractFloat16Frac(float16 a)
 113 {
 114     return float16_val(a) & 0x3ff;
 115 }
 116
 117 /*----------------------------------------------------------------------------
 118 | Returns the exponent bits of the half-precision floating-point value `a'.
 119 *----------------------------------------------------------------------------*/
 120
 121 static inline int extractFloat16Exp(float16 a)
 122 {
 123     return (float16_val(a) >> 10) & 0x1f;
 124 }
 125
 126 /*----------------------------------------------------------------------------
 127 | Returns the sign bit of the single-precision floating-point value `a'.
 128 *----------------------------------------------------------------------------*/
 129
 130 static inline flag extractFloat16Sign(float16 a)
 131 {
 132     return float16_val(a)>>15;
 133 }
 134
 135 /*----------------------------------------------------------------------------
 136 | Returns the fraction bits of the single-precision floating-point value `a'.
 137 *----------------------------------------------------------------------------*/
 138
 139 static inline uint32_t extractFloat32Frac(float32 a)
 140 {
 141     return float32_val(a) & 0x007FFFFF;
 142 }
 143
 144 /*----------------------------------------------------------------------------
 145 | Returns the exponent bits of the single-precision floating-point value `a'.
 146 *----------------------------------------------------------------------------*/
 147
 148 static inline int extractFloat32Exp(float32 a)
 149 {
 150     return (float32_val(a) >> 23) & 0xFF;
 151 }
 152
 153 /*----------------------------------------------------------------------------
 154 | Returns the sign bit of the single-precision floating-point value `a'.
 155 *----------------------------------------------------------------------------*/
 156
 157 static inline flag extractFloat32Sign(float32 a)
 158 {
 159     return float32_val(a) >> 31;
 160 }
 161
 162 /*----------------------------------------------------------------------------
 163 | Returns the fraction bits of the double-precision floating-point value `a'.
 164 *----------------------------------------------------------------------------*/
 165
 166 static inline uint64_t extractFloat64Frac(float64 a)
 167 {
 168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the exponent bits of the double-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 static inline int extractFloat64Exp(float64 a)
 176 {
 177     return (float64_val(a) >> 52) & 0x7FF;
 178 }
 179
 180 /*----------------------------------------------------------------------------
 181 | Returns the sign bit of the double-precision floating-point value `a'.
 182 *----------------------------------------------------------------------------*/
 183
 184 static inline flag extractFloat64Sign(float64 a)
 185 {
 186     return float64_val(a) >> 63;
 187 }
 188
 189 /*
 190  * Classify a floating point number. Everything above float_class_qnan
 191  * is a NaN so cls >= float_class_qnan is any NaN.
 192  */
 193
 194 typedef enum __attribute__ ((__packed__)) {
 195     float_class_unclassified,
 196     float_class_zero,
 197     float_class_normal,
 198     float_class_inf,
 199     float_class_qnan,  /* all NaNs from here */
 200     float_class_snan,
 201     float_class_dnan,
 202     float_class_msnan, /* maybe silenced */
 203 } FloatClass;
 204
 205 /*
 206  * Structure holding all of the decomposed parts of a float. The
 207  * exponent is unbiased and the fraction is normalized. All
 208  * calculations are done with a 64 bit fraction and then rounded as
 209  * appropriate for the final format.
 210  *
 211  * Thanks to the packed FloatClass a decent compiler should be able to
 212  * fit the whole structure into registers and avoid using the stack
 213  * for parameter passing.
 214  */
 215
 216 typedef struct {
 217     uint64_t frac;
 218     int32_t  exp;
 219     FloatClass cls;
 220     bool sign;
 221 } FloatParts;
 222
 223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 226
 227 /* Structure holding all of the relevant parameters for a format.
 228  *   exp_size: the size of the exponent field
 229  *   exp_bias: the offset applied to the exponent field
 230  *   exp_max: the maximum normalised exponent
 231  *   frac_size: the size of the fraction field
 232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 233  * The following are computed based the size of fraction
 234  *   frac_lsb: least significant bit of fraction
 235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 236  *   round_mask/roundeven_mask: masks used for rounding
 237  */
 238 typedef struct {
 239     int exp_size;
 240     int exp_bias;
 241     int exp_max;
 242     int frac_size;
 243     int frac_shift;
 244     uint64_t frac_lsb;
 245     uint64_t frac_lsbm1;
 246     uint64_t round_mask;
 247     uint64_t roundeven_mask;
 248 } FloatFmt;
 249
 250 /* Expand fields based on the size of exponent and fraction */
 251 #define FLOAT_PARAMS(E, F)                                           \
 252     .exp_size       = E,                                             \
 253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 254     .exp_max        = (1 << E) - 1,                                  \
 255     .frac_size      = F,                                             \
 256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 261
 262 static const FloatFmt float16_params = {
 263     FLOAT_PARAMS(5, 10)
 264 };
 265
 266 static const FloatFmt float32_params = {
 267     FLOAT_PARAMS(8, 23)
 268 };
 269
 270 static const FloatFmt float64_params = {
 271     FLOAT_PARAMS(11, 52)
 272 };
 273
 274 /* Unpack a float to parts, but do not canonicalize.  */
 275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 276 {
 277     const int sign_pos = fmt.frac_size + fmt.exp_size;
 278
 279     return (FloatParts) {
 280         .cls = float_class_unclassified,
 281         .sign = extract64(raw, sign_pos, 1),
 282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 283         .frac = extract64(raw, 0, fmt.frac_size),
 284     };
 285 }
 286
 287 static inline FloatParts float16_unpack_raw(float16 f)
 288 {
 289     return unpack_raw(float16_params, f);
 290 }
 291
 292 static inline FloatParts float32_unpack_raw(float32 f)
 293 {
 294     return unpack_raw(float32_params, f);
 295 }
 296
 297 static inline FloatParts float64_unpack_raw(float64 f)
 298 {
 299     return unpack_raw(float64_params, f);
 300 }
 301
 302 /* Pack a float from parts, but do not canonicalize.  */
 303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 304 {
 305     const int sign_pos = fmt.frac_size + fmt.exp_size;
 306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 307     return deposit64(ret, sign_pos, 1, p.sign);
 308 }
 309
 310 static inline float16 float16_pack_raw(FloatParts p)
 311 {
 312     return make_float16(pack_raw(float16_params, p));
 313 }
 314
 315 static inline float32 float32_pack_raw(FloatParts p)
 316 {
 317     return make_float32(pack_raw(float32_params, p));
 318 }
 319
 320 static inline float64 float64_pack_raw(FloatParts p)
 321 {
 322     return make_float64(pack_raw(float64_params, p));
 323 }
 324
 325 /* Canonicalize EXP and FRAC, setting CLS.  */
 326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 327                                float_status *status)
 328 {
 329     if (part.exp == parm->exp_max) {
 330         if (part.frac == 0) {
 331             part.cls = float_class_inf;
 332         } else {
 333 #ifdef NO_SIGNALING_NANS
 334             part.cls = float_class_qnan;
 335 #else
 336             int64_t msb = part.frac << (parm->frac_shift + 2);
 337             if ((msb < 0) == status->snan_bit_is_one) {
 338                 part.cls = float_class_snan;
 339             } else {
 340                 part.cls = float_class_qnan;
 341             }
 342 #endif
 343         }
 344     } else if (part.exp == 0) {
 345         if (likely(part.frac == 0)) {
 346             part.cls = float_class_zero;
 347         } else if (status->flush_inputs_to_zero) {
 348             float_raise(float_flag_input_denormal, status);
 349             part.cls = float_class_zero;
 350             part.frac = 0;
 351         } else {
 352             int shift = clz64(part.frac) - 1;
 353             part.cls = float_class_normal;
 354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 355             part.frac <<= shift;
 356         }
 357     } else {
 358         part.cls = float_class_normal;
 359         part.exp -= parm->exp_bias;
 360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 361     }
 362     return part;
 363 }
 364
 365 /* Round and uncanonicalize a floating-point number by parts. There
 366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 367  * fraction; these bits will be removed. The exponent will be biased
 368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 369  */
 370
 371 static FloatParts round_canonical(FloatParts p, float_status *s,
 372                                   const FloatFmt *parm)
 373 {
 374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 375     const uint64_t round_mask = parm->round_mask;
 376     const uint64_t roundeven_mask = parm->roundeven_mask;
 377     const int exp_max = parm->exp_max;
 378     const int frac_shift = parm->frac_shift;
 379     uint64_t frac, inc;
 380     int exp, flags = 0;
 381     bool overflow_norm;
 382
 383     frac = p.frac;
 384     exp = p.exp;
 385
 386     switch (p.cls) {
 387     case float_class_normal:
 388         switch (s->float_rounding_mode) {
 389         case float_round_nearest_even:
 390             overflow_norm = false;
 391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 392             break;
 393         case float_round_ties_away:
 394             overflow_norm = false;
 395             inc = frac_lsbm1;
 396             break;
 397         case float_round_to_zero:
 398             overflow_norm = true;
 399             inc = 0;
 400             break;
 401         case float_round_up:
 402             inc = p.sign ? 0 : round_mask;
 403             overflow_norm = p.sign;
 404             break;
 405         case float_round_down:
 406             inc = p.sign ? round_mask : 0;
 407             overflow_norm = !p.sign;
 408             break;
 409         default:
 410             g_assert_not_reached();
 411         }
 412
 413         exp += parm->exp_bias;
 414         if (likely(exp > 0)) {
 415             if (frac & round_mask) {
 416                 flags |= float_flag_inexact;
 417                 frac += inc;
 418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 419                     frac >>= 1;
 420                     exp++;
 421                 }
 422             }
 423             frac >>= frac_shift;
 424
 425             if (unlikely(exp >= exp_max)) {
 426                 flags |= float_flag_overflow | float_flag_inexact;
 427                 if (overflow_norm) {
 428                     exp = exp_max - 1;
 429                     frac = -1;
 430                 } else {
 431                     p.cls = float_class_inf;
 432                     goto do_inf;
 433                 }
 434             }
 435         } else if (s->flush_to_zero) {
 436             flags |= float_flag_output_denormal;
 437             p.cls = float_class_zero;
 438             goto do_zero;
 439         } else {
 440             bool is_tiny = (s->float_detect_tininess
 441                             == float_tininess_before_rounding)
 442                         || (exp < 0)
 443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 444
 445             shift64RightJamming(frac, 1 - exp, &frac);
 446             if (frac & round_mask) {
 447                 /* Need to recompute round-to-even.  */
 448                 if (s->float_rounding_mode == float_round_nearest_even) {
 449                     inc = ((frac & roundeven_mask) != frac_lsbm1
 450                            ? frac_lsbm1 : 0);
 451                 }
 452                 flags |= float_flag_inexact;
 453                 frac += inc;
 454             }
 455
 456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 457             frac >>= frac_shift;
 458
 459             if (is_tiny && (flags & float_flag_inexact)) {
 460                 flags |= float_flag_underflow;
 461             }
 462             if (exp == 0 && frac == 0) {
 463                 p.cls = float_class_zero;
 464             }
 465         }
 466         break;
 467
 468     case float_class_zero:
 469     do_zero:
 470         exp = 0;
 471         frac = 0;
 472         break;
 473
 474     case float_class_inf:
 475     do_inf:
 476         exp = exp_max;
 477         frac = 0;
 478         break;
 479
 480     case float_class_qnan:
 481     case float_class_snan:
 482         exp = exp_max;
 483         break;
 484
 485     default:
 486         g_assert_not_reached();
 487     }
 488
 489     float_raise(flags, s);
 490     p.exp = exp;
 491     p.frac = frac;
 492     return p;
 493 }
 494
 495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 496 {
 497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
 498 }
 499
 500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 501 {
 502     switch (p.cls) {
 503     case float_class_dnan:
 504         return float16_default_nan(s);
 505     case float_class_msnan:
 506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
 507     default:
 508         p = round_canonical(p, s, &float16_params);
 509         return float16_pack_raw(p);
 510     }
 511 }
 512
 513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 514 {
 515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 516 }
 517
 518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 519 {
 520     switch (p.cls) {
 521     case float_class_dnan:
 522         return float32_default_nan(s);
 523     case float_class_msnan:
 524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
 525     default:
 526         p = round_canonical(p, s, &float32_params);
 527         return float32_pack_raw(p);
 528     }
 529 }
 530
 531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 532 {
 533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 534 }
 535
 536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 537 {
 538     switch (p.cls) {
 539     case float_class_dnan:
 540         return float64_default_nan(s);
 541     case float_class_msnan:
 542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
 543     default:
 544         p = round_canonical(p, s, &float64_params);
 545         return float64_pack_raw(p);
 546     }
 547 }
 548
 549 /* Simple helpers for checking if what NaN we have */
 550 static bool is_nan(FloatClass c)
 551 {
 552     return unlikely(c >= float_class_qnan);
 553 }
 554 static bool is_snan(FloatClass c)
 555 {
 556     return c == float_class_snan;
 557 }
 558 static bool is_qnan(FloatClass c)
 559 {
 560     return c == float_class_qnan;
 561 }
 562
 563 static FloatParts return_nan(FloatParts a, float_status *s)
 564 {
 565     switch (a.cls) {
 566     case float_class_snan:
 567         s->float_exception_flags |= float_flag_invalid;
 568         a.cls = float_class_msnan;
 569         /* fall through */
 570     case float_class_qnan:
 571         if (s->default_nan_mode) {
 572             a.cls = float_class_dnan;
 573         }
 574         break;
 575
 576     default:
 577         g_assert_not_reached();
 578     }
 579     return a;
 580 }
 581
 582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 583 {
 584     if (is_snan(a.cls) || is_snan(b.cls)) {
 585         s->float_exception_flags |= float_flag_invalid;
 586     }
 587
 588     if (s->default_nan_mode) {
 589         a.cls = float_class_dnan;
 590     } else {
 591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 592                     is_qnan(b.cls), is_snan(b.cls),
 593                     a.frac > b.frac ||
 594                     (a.frac == b.frac && a.sign < b.sign))) {
 595             a = b;
 596         }
 597         a.cls = float_class_msnan;
 598     }
 599     return a;
 600 }
 601
 602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 603                                   bool inf_zero, float_status *s)
 604 {
 605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 606         s->float_exception_flags |= float_flag_invalid;
 607     }
 608
 609     if (s->default_nan_mode) {
 610         a.cls = float_class_dnan;
 611     } else {
 612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 613                               is_qnan(b.cls), is_snan(b.cls),
 614                               is_qnan(c.cls), is_snan(c.cls),
 615                               inf_zero, s)) {
 616         case 0:
 617             break;
 618         case 1:
 619             a = b;
 620             break;
 621         case 2:
 622             a = c;
 623             break;
 624         case 3:
 625             a.cls = float_class_dnan;
 626             return a;
 627         default:
 628             g_assert_not_reached();
 629         }
 630
 631         a.cls = float_class_msnan;
 632     }
 633     return a;
 634 }
 635
 636 /*
 637  * Returns the result of adding or subtracting the values of the
 638  * floating-point values `a' and `b'. The operation is performed
 639  * according to the IEC/IEEE Standard for Binary Floating-Point
 640  * Arithmetic.
 641  */
 642
 643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 644                                 float_status *s)
 645 {
 646     bool a_sign = a.sign;
 647     bool b_sign = b.sign ^ subtract;
 648
 649     if (a_sign != b_sign) {
 650         /* Subtraction */
 651
 652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 655                 a.frac = a.frac - b.frac;
 656             } else {
 657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 658                 a.frac = b.frac - a.frac;
 659                 a.exp = b.exp;
 660                 a_sign ^= 1;
 661             }
 662
 663             if (a.frac == 0) {
 664                 a.cls = float_class_zero;
 665                 a.sign = s->float_rounding_mode == float_round_down;
 666             } else {
 667                 int shift = clz64(a.frac) - 1;
 668                 a.frac = a.frac << shift;
 669                 a.exp = a.exp - shift;
 670                 a.sign = a_sign;
 671             }
 672             return a;
 673         }
 674         if (is_nan(a.cls) || is_nan(b.cls)) {
 675             return pick_nan(a, b, s);
 676         }
 677         if (a.cls == float_class_inf) {
 678             if (b.cls == float_class_inf) {
 679                 float_raise(float_flag_invalid, s);
 680                 a.cls = float_class_dnan;
 681             }
 682             return a;
 683         }
 684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 685             a.sign = s->float_rounding_mode == float_round_down;
 686             return a;
 687         }
 688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 689             b.sign = a_sign ^ 1;
 690             return b;
 691         }
 692         if (b.cls == float_class_zero) {
 693             return a;
 694         }
 695     } else {
 696         /* Addition */
 697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 698             if (a.exp > b.exp) {
 699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 700             } else if (a.exp < b.exp) {
 701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 702                 a.exp = b.exp;
 703             }
 704             a.frac += b.frac;
 705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 706                 a.frac >>= 1;
 707                 a.exp += 1;
 708             }
 709             return a;
 710         }
 711         if (is_nan(a.cls) || is_nan(b.cls)) {
 712             return pick_nan(a, b, s);
 713         }
 714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 715             return a;
 716         }
 717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 718             b.sign = b_sign;
 719             return b;
 720         }
 721     }
 722     g_assert_not_reached();
 723 }
 724
 725 /*
 726  * Returns the result of adding or subtracting the floating-point
 727  * values `a' and `b'. The operation is performed according to the
 728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 729  */
 730
 731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 732                                               float_status *status)
 733 {
 734     FloatParts pa = float16_unpack_canonical(a, status);
 735     FloatParts pb = float16_unpack_canonical(b, status);
 736     FloatParts pr = addsub_floats(pa, pb, false, status);
 737
 738     return float16_round_pack_canonical(pr, status);
 739 }
 740
 741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 742                                              float_status *status)
 743 {
 744     FloatParts pa = float32_unpack_canonical(a, status);
 745     FloatParts pb = float32_unpack_canonical(b, status);
 746     FloatParts pr = addsub_floats(pa, pb, false, status);
 747
 748     return float32_round_pack_canonical(pr, status);
 749 }
 750
 751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 752                                              float_status *status)
 753 {
 754     FloatParts pa = float64_unpack_canonical(a, status);
 755     FloatParts pb = float64_unpack_canonical(b, status);
 756     FloatParts pr = addsub_floats(pa, pb, false, status);
 757
 758     return float64_round_pack_canonical(pr, status);
 759 }
 760
 761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 762                                              float_status *status)
 763 {
 764     FloatParts pa = float16_unpack_canonical(a, status);
 765     FloatParts pb = float16_unpack_canonical(b, status);
 766     FloatParts pr = addsub_floats(pa, pb, true, status);
 767
 768     return float16_round_pack_canonical(pr, status);
 769 }
 770
 771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 772                                              float_status *status)
 773 {
 774     FloatParts pa = float32_unpack_canonical(a, status);
 775     FloatParts pb = float32_unpack_canonical(b, status);
 776     FloatParts pr = addsub_floats(pa, pb, true, status);
 777
 778     return float32_round_pack_canonical(pr, status);
 779 }
 780
 781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 782                                              float_status *status)
 783 {
 784     FloatParts pa = float64_unpack_canonical(a, status);
 785     FloatParts pb = float64_unpack_canonical(b, status);
 786     FloatParts pr = addsub_floats(pa, pb, true, status);
 787
 788     return float64_round_pack_canonical(pr, status);
 789 }
 790
 791 /*
 792  * Returns the result of multiplying the floating-point values `a' and
 793  * `b'. The operation is performed according to the IEC/IEEE Standard
 794  * for Binary Floating-Point Arithmetic.
 795  */
 796
 797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 798 {
 799     bool sign = a.sign ^ b.sign;
 800
 801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 802         uint64_t hi, lo;
 803         int exp = a.exp + b.exp;
 804
 805         mul64To128(a.frac, b.frac, &hi, &lo);
 806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 808             shift64RightJamming(lo, 1, &lo);
 809             exp += 1;
 810         }
 811
 812         /* Re-use a */
 813         a.exp = exp;
 814         a.sign = sign;
 815         a.frac = lo;
 816         return a;
 817     }
 818     /* handle all the NaN cases */
 819     if (is_nan(a.cls) || is_nan(b.cls)) {
 820         return pick_nan(a, b, s);
 821     }
 822     /* Inf * Zero == NaN */
 823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 825         s->float_exception_flags |= float_flag_invalid;
 826         a.cls = float_class_dnan;
 827         a.sign = sign;
 828         return a;
 829     }
 830     /* Multiply by 0 or Inf */
 831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 832         a.sign = sign;
 833         return a;
 834     }
 835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 836         b.sign = sign;
 837         return b;
 838     }
 839     g_assert_not_reached();
 840 }
 841
 842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 843                                              float_status *status)
 844 {
 845     FloatParts pa = float16_unpack_canonical(a, status);
 846     FloatParts pb = float16_unpack_canonical(b, status);
 847     FloatParts pr = mul_floats(pa, pb, status);
 848
 849     return float16_round_pack_canonical(pr, status);
 850 }
 851
 852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 853                                              float_status *status)
 854 {
 855     FloatParts pa = float32_unpack_canonical(a, status);
 856     FloatParts pb = float32_unpack_canonical(b, status);
 857     FloatParts pr = mul_floats(pa, pb, status);
 858
 859     return float32_round_pack_canonical(pr, status);
 860 }
 861
 862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 863                                              float_status *status)
 864 {
 865     FloatParts pa = float64_unpack_canonical(a, status);
 866     FloatParts pb = float64_unpack_canonical(b, status);
 867     FloatParts pr = mul_floats(pa, pb, status);
 868
 869     return float64_round_pack_canonical(pr, status);
 870 }
 871
 872 /*
 873  * Returns the result of multiplying the floating-point values `a' and
 874  * `b' then adding 'c', with no intermediate rounding step after the
 875  * multiplication. The operation is performed according to the
 876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 877  * The flags argument allows the caller to select negation of the
 878  * addend, the intermediate product, or the final result. (The
 879  * difference between this and having the caller do a separate
 880  * negation is that negating externally will flip the sign bit on
 881  * NaNs.)
 882  */
 883
 884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 885                                 int flags, float_status *s)
 886 {
 887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 888                     ((1 << float_class_inf) | (1 << float_class_zero));
 889     bool p_sign;
 890     bool sign_flip = flags & float_muladd_negate_result;
 891     FloatClass p_class;
 892     uint64_t hi, lo;
 893     int p_exp;
 894
 895     /* It is implementation-defined whether the cases of (0,inf,qnan)
 896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 897      * they return if they do), so we have to hand this information
 898      * off to the target-specific pick-a-NaN routine.
 899      */
 900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 901         return pick_nan_muladd(a, b, c, inf_zero, s);
 902     }
 903
 904     if (inf_zero) {
 905         s->float_exception_flags |= float_flag_invalid;
 906         a.cls = float_class_dnan;
 907         return a;
 908     }
 909
 910     if (flags & float_muladd_negate_c) {
 911         c.sign ^= 1;
 912     }
 913
 914     p_sign = a.sign ^ b.sign;
 915
 916     if (flags & float_muladd_negate_product) {
 917         p_sign ^= 1;
 918     }
 919
 920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 921         p_class = float_class_inf;
 922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 923         p_class = float_class_zero;
 924     } else {
 925         p_class = float_class_normal;
 926     }
 927
 928     if (c.cls == float_class_inf) {
 929         if (p_class == float_class_inf && p_sign != c.sign) {
 930             s->float_exception_flags |= float_flag_invalid;
 931             a.cls = float_class_dnan;
 932         } else {
 933             a.cls = float_class_inf;
 934             a.sign = c.sign ^ sign_flip;
 935         }
 936         return a;
 937     }
 938
 939     if (p_class == float_class_inf) {
 940         a.cls = float_class_inf;
 941         a.sign = p_sign ^ sign_flip;
 942         return a;
 943     }
 944
 945     if (p_class == float_class_zero) {
 946         if (c.cls == float_class_zero) {
 947             if (p_sign != c.sign) {
 948                 p_sign = s->float_rounding_mode == float_round_down;
 949             }
 950             c.sign = p_sign;
 951         } else if (flags & float_muladd_halve_result) {
 952             c.exp -= 1;
 953         }
 954         c.sign ^= sign_flip;
 955         return c;
 956     }
 957
 958     /* a & b should be normals now... */
 959     assert(a.cls == float_class_normal &&
 960            b.cls == float_class_normal);
 961
 962     p_exp = a.exp + b.exp;
 963
 964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 965      * result.
 966      */
 967     mul64To128(a.frac, b.frac, &hi, &lo);
 968     /* binary point now at bit 124 */
 969
 970     /* check for overflow */
 971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 972         shift128RightJamming(hi, lo, 1, &hi, &lo);
 973         p_exp += 1;
 974     }
 975
 976     /* + add/sub */
 977     if (c.cls == float_class_zero) {
 978         /* move binary point back to 62 */
 979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 980     } else {
 981         int exp_diff = p_exp - c.exp;
 982         if (p_sign == c.sign) {
 983             /* Addition */
 984             if (exp_diff <= 0) {
 985                 shift128RightJamming(hi, lo,
 986                                      DECOMPOSED_BINARY_POINT - exp_diff,
 987                                      &hi, &lo);
 988                 lo += c.frac;
 989                 p_exp = c.exp;
 990             } else {
 991                 uint64_t c_hi, c_lo;
 992                 /* shift c to the same binary point as the product (124) */
 993                 c_hi = c.frac >> 2;
 994                 c_lo = 0;
 995                 shift128RightJamming(c_hi, c_lo,
 996                                      exp_diff,
 997                                      &c_hi, &c_lo);
 998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072
1073     return a;
1074 }
1075
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345         return max;
1346     case float_class_inf:
1347         return p.sign ? min : max;
1348     case float_class_zero:
1349         return 0;
1350     case float_class_normal:
1351         if (p.exp < DECOMPOSED_BINARY_POINT) {
1352             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355         } else {
1356             r = UINT64_MAX;
1357         }
1358         if (p.sign) {
1359             if (r < -(uint64_t) min) {
1360                 return -r;
1361             } else {
1362                 s->float_exception_flags = orig_flags | float_flag_invalid;
1363                 return min;
1364             }
1365         } else {
1366             if (r < max) {
1367                 return r;
1368             } else {
1369                 s->float_exception_flags = orig_flags | float_flag_invalid;
1370                 return max;
1371             }
1372         }
1373     default:
1374         g_assert_not_reached();
1375     }
1376 }
1377
1378 #define FLOAT_TO_INT(fsz, isz)                                          \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1380                                                 float_status *s)        \
1381 {                                                                       \
1382     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1383     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1384                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385                                  s);                                    \
1386 }                                                                       \
1387                                                                         \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1389  (float ## fsz a, float_status *s)                                      \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, float_round_to_zero,                \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }
1396
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1400
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1404
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1408
1409 #undef FLOAT_TO_INT
1410
1411 /*
1412  *  Returns the result of converting the floating-point value `a' to
1413  *  the unsigned integer format. The conversion is performed according
1414  *  to the IEC/IEEE Standard for Binary Floating-Point
1415  *  Arithmetic---which means in particular that the conversion is
1416  *  rounded according to the current rounding mode. If `a' is a NaN,
1417  *  the largest unsigned integer is returned. Otherwise, if the
1418  *  conversion overflows, the largest unsigned integer is returned. If
1419  *  the 'a' is negative, the result is rounded and zero is returned;
1420  *  values that do not round to zero will raise the inexact exception
1421  *  flag.
1422  */
1423
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425                                        float_status *s)
1426 {
1427     int orig_flags = get_float_exception_flags(s);
1428     FloatParts p = round_to_int(in, rmode, s);
1429
1430     switch (p.cls) {
1431     case float_class_snan:
1432     case float_class_qnan:
1433         s->float_exception_flags = orig_flags | float_flag_invalid;
1434         return max;
1435     case float_class_inf:
1436         return p.sign ? 0 : max;
1437     case float_class_zero:
1438         return 0;
1439     case float_class_normal:
1440     {
1441         uint64_t r;
1442         if (p.sign) {
1443             s->float_exception_flags = orig_flags | float_flag_invalid;
1444             return 0;
1445         }
1446
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             s->float_exception_flags = orig_flags | float_flag_invalid;
1453             return max;
1454         }
1455
1456         /* For uint64 this will never trip, but if p.exp is too large
1457          * to shift a decomposed fraction we shall have exited via the
1458          * 3rd leg above.
1459          */
1460         if (r > max) {
1461             s->float_exception_flags = orig_flags | float_flag_invalid;
1462             return max;
1463         } else {
1464             return r;
1465         }
1466     }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1474                                                   float_status *s)      \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1478                                  UINT ## isz ## _MAX, s);               \
1479 }                                                                       \
1480                                                                         \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1482  (float ## fsz a, float_status *s)                                      \
1483 {                                                                       \
1484     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1485     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1486                                  UINT ## isz ## _MAX, s);               \
1487 }
1488
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1492
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1496
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1500
1501 #undef FLOAT_TO_UINT
1502
1503 /*
1504  * Integer to float conversions
1505  *
1506  * Returns the result of converting the two's complement integer `a'
1507  * to the floating-point format. The conversion is performed according
1508  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1509  */
1510
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1512 {
1513     FloatParts r;
1514     if (a == 0) {
1515         r.cls = float_class_zero;
1516         r.sign = false;
1517     } else if (a == (1ULL << 63)) {
1518         r.cls = float_class_normal;
1519         r.sign = true;
1520         r.frac = DECOMPOSED_IMPLICIT_BIT;
1521         r.exp = 63;
1522     } else {
1523         uint64_t f;
1524         if (a < 0) {
1525             f = -a;
1526             r.sign = true;
1527         } else {
1528             f = a;
1529             r.sign = false;
1530         }
1531         int shift = clz64(f) - 1;
1532         r.cls = float_class_normal;
1533         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534         r.frac = f << shift;
1535     }
1536
1537     return r;
1538 }
1539
1540 float16 int64_to_float16(int64_t a, float_status *status)
1541 {
1542     FloatParts pa = int_to_float(a, status);
1543     return float16_round_pack_canonical(pa, status);
1544 }
1545
1546 float16 int32_to_float16(int32_t a, float_status *status)
1547 {
1548     return int64_to_float16(a, status);
1549 }
1550
1551 float16 int16_to_float16(int16_t a, float_status *status)
1552 {
1553     return int64_to_float16(a, status);
1554 }
1555
1556 float32 int64_to_float32(int64_t a, float_status *status)
1557 {
1558     FloatParts pa = int_to_float(a, status);
1559     return float32_round_pack_canonical(pa, status);
1560 }
1561
1562 float32 int32_to_float32(int32_t a, float_status *status)
1563 {
1564     return int64_to_float32(a, status);
1565 }
1566
1567 float32 int16_to_float32(int16_t a, float_status *status)
1568 {
1569     return int64_to_float32(a, status);
1570 }
1571
1572 float64 int64_to_float64(int64_t a, float_status *status)
1573 {
1574     FloatParts pa = int_to_float(a, status);
1575     return float64_round_pack_canonical(pa, status);
1576 }
1577
1578 float64 int32_to_float64(int32_t a, float_status *status)
1579 {
1580     return int64_to_float64(a, status);
1581 }
1582
1583 float64 int16_to_float64(int16_t a, float_status *status)
1584 {
1585     return int64_to_float64(a, status);
1586 }
1587
1588
1589 /*
1590  * Unsigned Integer to float conversions
1591  *
1592  * Returns the result of converting the unsigned integer `a' to the
1593  * floating-point format. The conversion is performed according to the
1594  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1595  */
1596
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1598 {
1599     FloatParts r = { .sign = false};
1600
1601     if (a == 0) {
1602         r.cls = float_class_zero;
1603     } else {
1604         int spare_bits = clz64(a) - 1;
1605         r.cls = float_class_normal;
1606         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607         if (spare_bits < 0) {
1608             shift64RightJamming(a, -spare_bits, &a);
1609             r.frac = a;
1610         } else {
1611             r.frac = a << spare_bits;
1612         }
1613     }
1614
1615     return r;
1616 }
1617
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1619 {
1620     FloatParts pa = uint_to_float(a, status);
1621     return float16_round_pack_canonical(pa, status);
1622 }
1623
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1625 {
1626     return uint64_to_float16(a, status);
1627 }
1628
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1630 {
1631     return uint64_to_float16(a, status);
1632 }
1633
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1635 {
1636     FloatParts pa = uint_to_float(a, status);
1637     return float32_round_pack_canonical(pa, status);
1638 }
1639
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1641 {
1642     return uint64_to_float32(a, status);
1643 }
1644
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1646 {
1647     return uint64_to_float32(a, status);
1648 }
1649
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1651 {
1652     FloatParts pa = uint_to_float(a, status);
1653     return float64_round_pack_canonical(pa, status);
1654 }
1655
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1657 {
1658     return uint64_to_float64(a, status);
1659 }
1660
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1662 {
1663     return uint64_to_float64(a, status);
1664 }
1665
1666 /* Float Min/Max */
1667 /* min() and max() functions. These can't be implemented as
1668  * 'compare and pick one input' because that would mishandle
1669  * NaNs and +0 vs -0.
1670  *
1671  * minnum() and maxnum() functions. These are similar to the min()
1672  * and max() functions but if one of the arguments is a QNaN and
1673  * the other is numerical then the numerical argument is returned.
1674  * SNaNs will get quietened before being returned.
1675  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676  * and maxNum() operations. min() and max() are the typical min/max
1677  * semantics provided by many CPUs which predate that specification.
1678  *
1679  * minnummag() and maxnummag() functions correspond to minNumMag()
1680  * and minNumMag() from the IEEE-754 2008.
1681  */
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683                                 bool ieee, bool ismag, float_status *s)
1684 {
1685     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1686         if (ieee) {
1687             /* Takes two floating-point values `a' and `b', one of
1688              * which is a NaN, and returns the appropriate NaN
1689              * result. If either `a' or `b' is a signaling NaN,
1690              * the invalid exception is raised.
1691              */
1692             if (is_snan(a.cls) || is_snan(b.cls)) {
1693                 return pick_nan(a, b, s);
1694             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1695                 return b;
1696             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697                 return a;
1698             }
1699         }
1700         return pick_nan(a, b, s);
1701     } else {
1702         int a_exp, b_exp;
1703         bool a_sign, b_sign;
1704
1705         switch (a.cls) {
1706         case float_class_normal:
1707             a_exp = a.exp;
1708             break;
1709         case float_class_inf:
1710             a_exp = INT_MAX;
1711             break;
1712         case float_class_zero:
1713             a_exp = INT_MIN;
1714             break;
1715         default:
1716             g_assert_not_reached();
1717             break;
1718         }
1719         switch (b.cls) {
1720         case float_class_normal:
1721             b_exp = b.exp;
1722             break;
1723         case float_class_inf:
1724             b_exp = INT_MAX;
1725             break;
1726         case float_class_zero:
1727             b_exp = INT_MIN;
1728             break;
1729         default:
1730             g_assert_not_reached();
1731             break;
1732         }
1733
1734         a_sign = a.sign;
1735         b_sign = b.sign;
1736         if (ismag) {
1737             a_sign = b_sign = 0;
1738         }
1739
1740         if (a_sign == b_sign) {
1741             bool a_less = a_exp < b_exp;
1742             if (a_exp == b_exp) {
1743                 a_less = a.frac < b.frac;
1744             }
1745             return a_sign ^ a_less ^ ismin ? b : a;
1746         } else {
1747             return a_sign ^ ismin ? b : a;
1748         }
1749     }
1750 }
1751
1752 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1754                                      float_status *s)                   \
1755 {                                                                       \
1756     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1757     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1758     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1759                                                                         \
1760     return float ## sz ## _round_pack_canonical(pr, s);                 \
1761 }
1762
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1769
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1776
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1783
1784 #undef MINMAX
1785
1786 /* Multiply A by 2 raised to the power N.  */
1787 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1788 {
1789     if (unlikely(is_nan(a.cls))) {
1790         return return_nan(a, s);
1791     }
1792     if (a.cls == float_class_normal) {
1793         a.exp += n;
1794     }
1795     return a;
1796 }
1797
1798 float16 float16_scalbn(float16 a, int n, float_status *status)
1799 {
1800     FloatParts pa = float16_unpack_canonical(a, status);
1801     FloatParts pr = scalbn_decomposed(pa, n, status);
1802     return float16_round_pack_canonical(pr, status);
1803 }
1804
1805 float32 float32_scalbn(float32 a, int n, float_status *status)
1806 {
1807     FloatParts pa = float32_unpack_canonical(a, status);
1808     FloatParts pr = scalbn_decomposed(pa, n, status);
1809     return float32_round_pack_canonical(pr, status);
1810 }
1811
1812 float64 float64_scalbn(float64 a, int n, float_status *status)
1813 {
1814     FloatParts pa = float64_unpack_canonical(a, status);
1815     FloatParts pr = scalbn_decomposed(pa, n, status);
1816     return float64_round_pack_canonical(pr, status);
1817 }
1818
1819 /*----------------------------------------------------------------------------
1820 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1821 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1822 | input.  If `zSign' is 1, the input is negated before being converted to an
1823 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
1824 | is simply rounded to an integer, with the inexact exception raised if the
1825 | input cannot be represented exactly as an integer.  However, if the fixed-
1826 | point input is too large, the invalid exception is raised and the largest
1827 | positive or negative integer is returned.
1828 *----------------------------------------------------------------------------*/
1829
1830 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
1831 {
1832     int8_t roundingMode;
1833     flag roundNearestEven;
1834     int8_t roundIncrement, roundBits;
1835     int32_t z;
1836
1837     roundingMode = status->float_rounding_mode;
1838     roundNearestEven = ( roundingMode == float_round_nearest_even );
1839     switch (roundingMode) {
1840     case float_round_nearest_even:
1841     case float_round_ties_away:
1842         roundIncrement = 0x40;
1843         break;
1844     case float_round_to_zero:
1845         roundIncrement = 0;
1846         break;
1847     case float_round_up:
1848         roundIncrement = zSign ? 0 : 0x7f;
1849         break;
1850     case float_round_down:
1851         roundIncrement = zSign ? 0x7f : 0;
1852         break;
1853     default:
1854         abort();
1855     }
1856     roundBits = absZ & 0x7F;
1857     absZ = ( absZ + roundIncrement )>>7;
1858     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
1859     z = absZ;
1860     if ( zSign ) z = - z;
1861     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
1862         float_raise(float_flag_invalid, status);
1863         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
1864     }
1865     if (roundBits) {
1866         status->float_exception_flags |= float_flag_inexact;
1867     }
1868     return z;
1869
1870 }
1871
1872 /*----------------------------------------------------------------------------
1873 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1874 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1875 | and returns the properly rounded 64-bit integer corresponding to the input.
1876 | If `zSign' is 1, the input is negated before being converted to an integer.
1877 | Ordinarily, the fixed-point input is simply rounded to an integer, with
1878 | the inexact exception raised if the input cannot be represented exactly as
1879 | an integer.  However, if the fixed-point input is too large, the invalid
1880 | exception is raised and the largest positive or negative integer is
1881 | returned.
1882 *----------------------------------------------------------------------------*/
1883
1884 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
1885                                float_status *status)
1886 {
1887     int8_t roundingMode;
1888     flag roundNearestEven, increment;
1889     int64_t z;
1890
1891     roundingMode = status->float_rounding_mode;
1892     roundNearestEven = ( roundingMode == float_round_nearest_even );
1893     switch (roundingMode) {
1894     case float_round_nearest_even:
1895     case float_round_ties_away:
1896         increment = ((int64_t) absZ1 < 0);
1897         break;
1898     case float_round_to_zero:
1899         increment = 0;
1900         break;
1901     case float_round_up:
1902         increment = !zSign && absZ1;
1903         break;
1904     case float_round_down:
1905         increment = zSign && absZ1;
1906         break;
1907     default:
1908         abort();
1909     }
1910     if ( increment ) {
1911         ++absZ0;
1912         if ( absZ0 == 0 ) goto overflow;
1913         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
1914     }
1915     z = absZ0;
1916     if ( zSign ) z = - z;
1917     if ( z && ( ( z < 0 ) ^ zSign ) ) {
1918  overflow:
1919         float_raise(float_flag_invalid, status);
1920         return
1921               zSign ? (int64_t) LIT64( 0x8000000000000000 )
1922             : LIT64( 0x7FFFFFFFFFFFFFFF );
1923     }
1924     if (absZ1) {
1925         status->float_exception_flags |= float_flag_inexact;
1926     }
1927     return z;
1928
1929 }
1930
1931 /*----------------------------------------------------------------------------
1932 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
1933 | `absZ1', with binary point between bits 63 and 64 (between the input words),
1934 | and returns the properly rounded 64-bit unsigned integer corresponding to the
1935 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
1936 | with the inexact exception raised if the input cannot be represented exactly
1937 | as an integer.  However, if the fixed-point input is too large, the invalid
1938 | exception is raised and the largest unsigned integer is returned.
1939 *----------------------------------------------------------------------------*/
1940
1941 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
1942                                 uint64_t absZ1, float_status *status)
1943 {
1944     int8_t roundingMode;
1945     flag roundNearestEven, increment;
1946
1947     roundingMode = status->float_rounding_mode;
1948     roundNearestEven = (roundingMode == float_round_nearest_even);
1949     switch (roundingMode) {
1950     case float_round_nearest_even:
1951     case float_round_ties_away:
1952         increment = ((int64_t)absZ1 < 0);
1953         break;
1954     case float_round_to_zero:
1955         increment = 0;
1956         break;
1957     case float_round_up:
1958         increment = !zSign && absZ1;
1959         break;
1960     case float_round_down:
1961         increment = zSign && absZ1;
1962         break;
1963     default:
1964         abort();
1965     }
1966     if (increment) {
1967         ++absZ0;
1968         if (absZ0 == 0) {
1969             float_raise(float_flag_invalid, status);
1970             return LIT64(0xFFFFFFFFFFFFFFFF);
1971         }
1972         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
1973     }
1974
1975     if (zSign && absZ0) {
1976         float_raise(float_flag_invalid, status);
1977         return 0;
1978     }
1979
1980     if (absZ1) {
1981         status->float_exception_flags |= float_flag_inexact;
1982     }
1983     return absZ0;
1984 }
1985
1986 /*----------------------------------------------------------------------------
1987 | If `a' is denormal and we are in flush-to-zero mode then set the
1988 | input-denormal exception and return zero. Otherwise just return the value.
1989 *----------------------------------------------------------------------------*/
1990 float32 float32_squash_input_denormal(float32 a, float_status *status)
1991 {
1992     if (status->flush_inputs_to_zero) {
1993         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
1994             float_raise(float_flag_input_denormal, status);
1995             return make_float32(float32_val(a) & 0x80000000);
1996         }
1997     }
1998     return a;
1999 }
2000
2001 /*----------------------------------------------------------------------------
2002 | Normalizes the subnormal single-precision floating-point value represented
2003 | by the denormalized significand `aSig'.  The normalized exponent and
2004 | significand are stored at the locations pointed to by `zExpPtr' and
2005 | `zSigPtr', respectively.
2006 *----------------------------------------------------------------------------*/
2007
2008 static void
2009  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2010 {
2011     int8_t shiftCount;
2012
2013     shiftCount = countLeadingZeros32( aSig ) - 8;
2014     *zSigPtr = aSig<<shiftCount;
2015     *zExpPtr = 1 - shiftCount;
2016
2017 }
2018
2019 /*----------------------------------------------------------------------------
2020 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2021 | single-precision floating-point value, returning the result.  After being
2022 | shifted into the proper positions, the three fields are simply added
2023 | together to form the result.  This means that any integer portion of `zSig'
2024 | will be added into the exponent.  Since a properly normalized significand
2025 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2026 | than the desired result exponent whenever `zSig' is a complete, normalized
2027 | significand.
2028 *----------------------------------------------------------------------------*/
2029
2030 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
2031 {
2032
2033     return make_float32(
2034           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
2035
2036 }
2037
2038 /*----------------------------------------------------------------------------
2039 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2040 | and significand `zSig', and returns the proper single-precision floating-
2041 | point value corresponding to the abstract input.  Ordinarily, the abstract
2042 | value is simply rounded and packed into the single-precision format, with
2043 | the inexact exception raised if the abstract input cannot be represented
2044 | exactly.  However, if the abstract value is too large, the overflow and
2045 | inexact exceptions are raised and an infinity or maximal finite value is
2046 | returned.  If the abstract value is too small, the input value is rounded to
2047 | a subnormal number, and the underflow and inexact exceptions are raised if
2048 | the abstract input cannot be represented exactly as a subnormal single-
2049 | precision floating-point number.
2050 |     The input significand `zSig' has its binary point between bits 30
2051 | and 29, which is 7 bits to the left of the usual location.  This shifted
2052 | significand must be normalized or smaller.  If `zSig' is not normalized,
2053 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2054 | and it must not require rounding.  In the usual case that `zSig' is
2055 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2056 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2057 | Binary Floating-Point Arithmetic.
2058 *----------------------------------------------------------------------------*/
2059
2060 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2061                                    float_status *status)
2062 {
2063     int8_t roundingMode;
2064     flag roundNearestEven;
2065     int8_t roundIncrement, roundBits;
2066     flag isTiny;
2067
2068     roundingMode = status->float_rounding_mode;
2069     roundNearestEven = ( roundingMode == float_round_nearest_even );
2070     switch (roundingMode) {
2071     case float_round_nearest_even:
2072     case float_round_ties_away:
2073         roundIncrement = 0x40;
2074         break;
2075     case float_round_to_zero:
2076         roundIncrement = 0;
2077         break;
2078     case float_round_up:
2079         roundIncrement = zSign ? 0 : 0x7f;
2080         break;
2081     case float_round_down:
2082         roundIncrement = zSign ? 0x7f : 0;
2083         break;
2084     default:
2085         abort();
2086         break;
2087     }
2088     roundBits = zSig & 0x7F;
2089     if ( 0xFD <= (uint16_t) zExp ) {
2090         if (    ( 0xFD < zExp )
2091              || (    ( zExp == 0xFD )
2092                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2093            ) {
2094             float_raise(float_flag_overflow | float_flag_inexact, status);
2095             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2096         }
2097         if ( zExp < 0 ) {
2098             if (status->flush_to_zero) {
2099                 float_raise(float_flag_output_denormal, status);
2100                 return packFloat32(zSign, 0, 0);
2101             }
2102             isTiny =
2103                 (status->float_detect_tininess
2104                  == float_tininess_before_rounding)
2105                 || ( zExp < -1 )
2106                 || ( zSig + roundIncrement < 0x80000000 );
2107             shift32RightJamming( zSig, - zExp, &zSig );
2108             zExp = 0;
2109             roundBits = zSig & 0x7F;
2110             if (isTiny && roundBits) {
2111                 float_raise(float_flag_underflow, status);
2112             }
2113         }
2114     }
2115     if (roundBits) {
2116         status->float_exception_flags |= float_flag_inexact;
2117     }
2118     zSig = ( zSig + roundIncrement )>>7;
2119     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2120     if ( zSig == 0 ) zExp = 0;
2121     return packFloat32( zSign, zExp, zSig );
2122
2123 }
2124
2125 /*----------------------------------------------------------------------------
2126 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2127 | and significand `zSig', and returns the proper single-precision floating-
2128 | point value corresponding to the abstract input.  This routine is just like
2129 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2130 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2131 | floating-point exponent.
2132 *----------------------------------------------------------------------------*/
2133
2134 static float32
2135  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2136                               float_status *status)
2137 {
2138     int8_t shiftCount;
2139
2140     shiftCount = countLeadingZeros32( zSig ) - 1;
2141     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2142                                status);
2143
2144 }
2145
2146 /*----------------------------------------------------------------------------
2147 | If `a' is denormal and we are in flush-to-zero mode then set the
2148 | input-denormal exception and return zero. Otherwise just return the value.
2149 *----------------------------------------------------------------------------*/
2150 float64 float64_squash_input_denormal(float64 a, float_status *status)
2151 {
2152     if (status->flush_inputs_to_zero) {
2153         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2154             float_raise(float_flag_input_denormal, status);
2155             return make_float64(float64_val(a) & (1ULL << 63));
2156         }
2157     }
2158     return a;
2159 }
2160
2161 /*----------------------------------------------------------------------------
2162 | Normalizes the subnormal double-precision floating-point value represented
2163 | by the denormalized significand `aSig'.  The normalized exponent and
2164 | significand are stored at the locations pointed to by `zExpPtr' and
2165 | `zSigPtr', respectively.
2166 *----------------------------------------------------------------------------*/
2167
2168 static void
2169  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2170 {
2171     int8_t shiftCount;
2172
2173     shiftCount = countLeadingZeros64( aSig ) - 11;
2174     *zSigPtr = aSig<<shiftCount;
2175     *zExpPtr = 1 - shiftCount;
2176
2177 }
2178
2179 /*----------------------------------------------------------------------------
2180 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2181 | double-precision floating-point value, returning the result.  After being
2182 | shifted into the proper positions, the three fields are simply added
2183 | together to form the result.  This means that any integer portion of `zSig'
2184 | will be added into the exponent.  Since a properly normalized significand
2185 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2186 | than the desired result exponent whenever `zSig' is a complete, normalized
2187 | significand.
2188 *----------------------------------------------------------------------------*/
2189
2190 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2191 {
2192
2193     return make_float64(
2194         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2195
2196 }
2197
2198 /*----------------------------------------------------------------------------
2199 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2200 | and significand `zSig', and returns the proper double-precision floating-
2201 | point value corresponding to the abstract input.  Ordinarily, the abstract
2202 | value is simply rounded and packed into the double-precision format, with
2203 | the inexact exception raised if the abstract input cannot be represented
2204 | exactly.  However, if the abstract value is too large, the overflow and
2205 | inexact exceptions are raised and an infinity or maximal finite value is
2206 | returned.  If the abstract value is too small, the input value is rounded to
2207 | a subnormal number, and the underflow and inexact exceptions are raised if
2208 | the abstract input cannot be represented exactly as a subnormal double-
2209 | precision floating-point number.
2210 |     The input significand `zSig' has its binary point between bits 62
2211 | and 61, which is 10 bits to the left of the usual location.  This shifted
2212 | significand must be normalized or smaller.  If `zSig' is not normalized,
2213 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2214 | and it must not require rounding.  In the usual case that `zSig' is
2215 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2216 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2217 | Binary Floating-Point Arithmetic.
2218 *----------------------------------------------------------------------------*/
2219
2220 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2221                                    float_status *status)
2222 {
2223     int8_t roundingMode;
2224     flag roundNearestEven;
2225     int roundIncrement, roundBits;
2226     flag isTiny;
2227
2228     roundingMode = status->float_rounding_mode;
2229     roundNearestEven = ( roundingMode == float_round_nearest_even );
2230     switch (roundingMode) {
2231     case float_round_nearest_even:
2232     case float_round_ties_away:
2233         roundIncrement = 0x200;
2234         break;
2235     case float_round_to_zero:
2236         roundIncrement = 0;
2237         break;
2238     case float_round_up:
2239         roundIncrement = zSign ? 0 : 0x3ff;
2240         break;
2241     case float_round_down:
2242         roundIncrement = zSign ? 0x3ff : 0;
2243         break;
2244     case float_round_to_odd:
2245         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2246         break;
2247     default:
2248         abort();
2249     }
2250     roundBits = zSig & 0x3FF;
2251     if ( 0x7FD <= (uint16_t) zExp ) {
2252         if (    ( 0x7FD < zExp )
2253              || (    ( zExp == 0x7FD )
2254                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2255            ) {
2256             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2257                                    roundIncrement != 0;
2258             float_raise(float_flag_overflow | float_flag_inexact, status);
2259             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2260         }
2261         if ( zExp < 0 ) {
2262             if (status->flush_to_zero) {
2263                 float_raise(float_flag_output_denormal, status);
2264                 return packFloat64(zSign, 0, 0);
2265             }
2266             isTiny =
2267                    (status->float_detect_tininess
2268                     == float_tininess_before_rounding)
2269                 || ( zExp < -1 )
2270                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2271             shift64RightJamming( zSig, - zExp, &zSig );
2272             zExp = 0;
2273             roundBits = zSig & 0x3FF;
2274             if (isTiny && roundBits) {
2275                 float_raise(float_flag_underflow, status);
2276             }
2277             if (roundingMode == float_round_to_odd) {
2278                 /*
2279                  * For round-to-odd case, the roundIncrement depends on
2280                  * zSig which just changed.
2281                  */
2282                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2283             }
2284         }
2285     }
2286     if (roundBits) {
2287         status->float_exception_flags |= float_flag_inexact;
2288     }
2289     zSig = ( zSig + roundIncrement )>>10;
2290     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2291     if ( zSig == 0 ) zExp = 0;
2292     return packFloat64( zSign, zExp, zSig );
2293
2294 }
2295
2296 /*----------------------------------------------------------------------------
2297 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2298 | and significand `zSig', and returns the proper double-precision floating-
2299 | point value corresponding to the abstract input.  This routine is just like
2300 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2301 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2302 | floating-point exponent.
2303 *----------------------------------------------------------------------------*/
2304
2305 static float64
2306  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2307                               float_status *status)
2308 {
2309     int8_t shiftCount;
2310
2311     shiftCount = countLeadingZeros64( zSig ) - 1;
2312     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2313                                status);
2314
2315 }
2316
2317 /*----------------------------------------------------------------------------
2318 | Returns the fraction bits of the extended double-precision floating-point
2319 | value `a'.
2320 *----------------------------------------------------------------------------*/
2321
2322 static inline uint64_t extractFloatx80Frac( floatx80 a )
2323 {
2324
2325     return a.low;
2326
2327 }
2328
2329 /*----------------------------------------------------------------------------
2330 | Returns the exponent bits of the extended double-precision floating-point
2331 | value `a'.
2332 *----------------------------------------------------------------------------*/
2333
2334 static inline int32_t extractFloatx80Exp( floatx80 a )
2335 {
2336
2337     return a.high & 0x7FFF;
2338
2339 }
2340
2341 /*----------------------------------------------------------------------------
2342 | Returns the sign bit of the extended double-precision floating-point value
2343 | `a'.
2344 *----------------------------------------------------------------------------*/
2345
2346 static inline flag extractFloatx80Sign( floatx80 a )
2347 {
2348
2349     return a.high>>15;
2350
2351 }
2352
2353 /*----------------------------------------------------------------------------
2354 | Normalizes the subnormal extended double-precision floating-point value
2355 | represented by the denormalized significand `aSig'.  The normalized exponent
2356 | and significand are stored at the locations pointed to by `zExpPtr' and
2357 | `zSigPtr', respectively.
2358 *----------------------------------------------------------------------------*/
2359
2360 static void
2361  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
2362 {
2363     int8_t shiftCount;
2364
2365     shiftCount = countLeadingZeros64( aSig );
2366     *zSigPtr = aSig<<shiftCount;
2367     *zExpPtr = 1 - shiftCount;
2368
2369 }
2370
2371 /*----------------------------------------------------------------------------
2372 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2373 | extended double-precision floating-point value, returning the result.
2374 *----------------------------------------------------------------------------*/
2375
2376 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2377 {
2378     floatx80 z;
2379
2380     z.low = zSig;
2381     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2382     return z;
2383
2384 }
2385
2386 /*----------------------------------------------------------------------------
2387 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2388 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2389 | and returns the proper extended double-precision floating-point value
2390 | corresponding to the abstract input.  Ordinarily, the abstract value is
2391 | rounded and packed into the extended double-precision format, with the
2392 | inexact exception raised if the abstract input cannot be represented
2393 | exactly.  However, if the abstract value is too large, the overflow and
2394 | inexact exceptions are raised and an infinity or maximal finite value is
2395 | returned.  If the abstract value is too small, the input value is rounded to
2396 | a subnormal number, and the underflow and inexact exceptions are raised if
2397 | the abstract input cannot be represented exactly as a subnormal extended
2398 | double-precision floating-point number.
2399 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2400 | number of bits as single or double precision, respectively.  Otherwise, the
2401 | result is rounded to the full precision of the extended double-precision
2402 | format.
2403 |     The input significand must be normalized or smaller.  If the input
2404 | significand is not normalized, `zExp' must be 0; in that case, the result
2405 | returned is a subnormal number, and it must not require rounding.  The
2406 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2407 | Floating-Point Arithmetic.
2408 *----------------------------------------------------------------------------*/
2409
2410 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2411                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2412                                      float_status *status)
2413 {
2414     int8_t roundingMode;
2415     flag roundNearestEven, increment, isTiny;
2416     int64_t roundIncrement, roundMask, roundBits;
2417
2418     roundingMode = status->float_rounding_mode;
2419     roundNearestEven = ( roundingMode == float_round_nearest_even );
2420     if ( roundingPrecision == 80 ) goto precision80;
2421     if ( roundingPrecision == 64 ) {
2422         roundIncrement = LIT64( 0x0000000000000400 );
2423         roundMask = LIT64( 0x00000000000007FF );
2424     }
2425     else if ( roundingPrecision == 32 ) {
2426         roundIncrement = LIT64( 0x0000008000000000 );
2427         roundMask = LIT64( 0x000000FFFFFFFFFF );
2428     }
2429     else {
2430         goto precision80;
2431     }
2432     zSig0 |= ( zSig1 != 0 );
2433     switch (roundingMode) {
2434     case float_round_nearest_even:
2435     case float_round_ties_away:
2436         break;
2437     case float_round_to_zero:
2438         roundIncrement = 0;
2439         break;
2440     case float_round_up:
2441         roundIncrement = zSign ? 0 : roundMask;
2442         break;
2443     case float_round_down:
2444         roundIncrement = zSign ? roundMask : 0;
2445         break;
2446     default:
2447         abort();
2448     }
2449     roundBits = zSig0 & roundMask;
2450     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2451         if (    ( 0x7FFE < zExp )
2452              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2453            ) {
2454             goto overflow;
2455         }
2456         if ( zExp <= 0 ) {
2457             if (status->flush_to_zero) {
2458                 float_raise(float_flag_output_denormal, status);
2459                 return packFloatx80(zSign, 0, 0);
2460             }
2461             isTiny =
2462                    (status->float_detect_tininess
2463                     == float_tininess_before_rounding)
2464                 || ( zExp < 0 )
2465                 || ( zSig0 <= zSig0 + roundIncrement );
2466             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2467             zExp = 0;
2468             roundBits = zSig0 & roundMask;
2469             if (isTiny && roundBits) {
2470                 float_raise(float_flag_underflow, status);
2471             }
2472             if (roundBits) {
2473                 status->float_exception_flags |= float_flag_inexact;
2474             }
2475             zSig0 += roundIncrement;
2476             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2477             roundIncrement = roundMask + 1;
2478             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2479                 roundMask |= roundIncrement;
2480             }
2481             zSig0 &= ~ roundMask;
2482             return packFloatx80( zSign, zExp, zSig0 );
2483         }
2484     }
2485     if (roundBits) {
2486         status->float_exception_flags |= float_flag_inexact;
2487     }
2488     zSig0 += roundIncrement;
2489     if ( zSig0 < roundIncrement ) {
2490         ++zExp;
2491         zSig0 = LIT64( 0x8000000000000000 );
2492     }
2493     roundIncrement = roundMask + 1;
2494     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2495         roundMask |= roundIncrement;
2496     }
2497     zSig0 &= ~ roundMask;
2498     if ( zSig0 == 0 ) zExp = 0;
2499     return packFloatx80( zSign, zExp, zSig0 );
2500  precision80:
2501     switch (roundingMode) {
2502     case float_round_nearest_even:
2503     case float_round_ties_away:
2504         increment = ((int64_t)zSig1 < 0);
2505         break;
2506     case float_round_to_zero:
2507         increment = 0;
2508         break;
2509     case float_round_up:
2510         increment = !zSign && zSig1;
2511         break;
2512     case float_round_down:
2513         increment = zSign && zSig1;
2514         break;
2515     default:
2516         abort();
2517     }
2518     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2519         if (    ( 0x7FFE < zExp )
2520              || (    ( zExp == 0x7FFE )
2521                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2522                   && increment
2523                 )
2524            ) {
2525             roundMask = 0;
2526  overflow:
2527             float_raise(float_flag_overflow | float_flag_inexact, status);
2528             if (    ( roundingMode == float_round_to_zero )
2529                  || ( zSign && ( roundingMode == float_round_up ) )
2530                  || ( ! zSign && ( roundingMode == float_round_down ) )
2531                ) {
2532                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2533             }
2534             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2535         }
2536         if ( zExp <= 0 ) {
2537             isTiny =
2538                    (status->float_detect_tininess
2539                     == float_tininess_before_rounding)
2540                 || ( zExp < 0 )
2541                 || ! increment
2542                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2543             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2544             zExp = 0;
2545             if (isTiny && zSig1) {
2546                 float_raise(float_flag_underflow, status);
2547             }
2548             if (zSig1) {
2549                 status->float_exception_flags |= float_flag_inexact;
2550             }
2551             switch (roundingMode) {
2552             case float_round_nearest_even:
2553             case float_round_ties_away:
2554                 increment = ((int64_t)zSig1 < 0);
2555                 break;
2556             case float_round_to_zero:
2557                 increment = 0;
2558                 break;
2559             case float_round_up:
2560                 increment = !zSign && zSig1;
2561                 break;
2562             case float_round_down:
2563                 increment = zSign && zSig1;
2564                 break;
2565             default:
2566                 abort();
2567             }
2568             if ( increment ) {
2569                 ++zSig0;
2570                 zSig0 &=
2571                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2572                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2573             }
2574             return packFloatx80( zSign, zExp, zSig0 );
2575         }
2576     }
2577     if (zSig1) {
2578         status->float_exception_flags |= float_flag_inexact;
2579     }
2580     if ( increment ) {
2581         ++zSig0;
2582         if ( zSig0 == 0 ) {
2583             ++zExp;
2584             zSig0 = LIT64( 0x8000000000000000 );
2585         }
2586         else {
2587             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2588         }
2589     }
2590     else {
2591         if ( zSig0 == 0 ) zExp = 0;
2592     }
2593     return packFloatx80( zSign, zExp, zSig0 );
2594
2595 }
2596
2597 /*----------------------------------------------------------------------------
2598 | Takes an abstract floating-point value having sign `zSign', exponent
2599 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2600 | and returns the proper extended double-precision floating-point value
2601 | corresponding to the abstract input.  This routine is just like
2602 | `roundAndPackFloatx80' except that the input significand does not have to be
2603 | normalized.
2604 *----------------------------------------------------------------------------*/
2605
2606 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2607                                               flag zSign, int32_t zExp,
2608                                               uint64_t zSig0, uint64_t zSig1,
2609                                               float_status *status)
2610 {
2611     int8_t shiftCount;
2612
2613     if ( zSig0 == 0 ) {
2614         zSig0 = zSig1;
2615         zSig1 = 0;
2616         zExp -= 64;
2617     }
2618     shiftCount = countLeadingZeros64( zSig0 );
2619     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2620     zExp -= shiftCount;
2621     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2622                                 zSig0, zSig1, status);
2623
2624 }
2625
2626 /*----------------------------------------------------------------------------
2627 | Returns the least-significant 64 fraction bits of the quadruple-precision
2628 | floating-point value `a'.
2629 *----------------------------------------------------------------------------*/
2630
2631 static inline uint64_t extractFloat128Frac1( float128 a )
2632 {
2633
2634     return a.low;
2635
2636 }
2637
2638 /*----------------------------------------------------------------------------
2639 | Returns the most-significant 48 fraction bits of the quadruple-precision
2640 | floating-point value `a'.
2641 *----------------------------------------------------------------------------*/
2642
2643 static inline uint64_t extractFloat128Frac0( float128 a )
2644 {
2645
2646     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2647
2648 }
2649
2650 /*----------------------------------------------------------------------------
2651 | Returns the exponent bits of the quadruple-precision floating-point value
2652 | `a'.
2653 *----------------------------------------------------------------------------*/
2654
2655 static inline int32_t extractFloat128Exp( float128 a )
2656 {
2657
2658     return ( a.high>>48 ) & 0x7FFF;
2659
2660 }
2661
2662 /*----------------------------------------------------------------------------
2663 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2664 *----------------------------------------------------------------------------*/
2665
2666 static inline flag extractFloat128Sign( float128 a )
2667 {
2668
2669     return a.high>>63;
2670
2671 }
2672
2673 /*----------------------------------------------------------------------------
2674 | Normalizes the subnormal quadruple-precision floating-point value
2675 | represented by the denormalized significand formed by the concatenation of
2676 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2677 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2678 | significand are stored at the location pointed to by `zSig0Ptr', and the
2679 | least significant 64 bits of the normalized significand are stored at the
2680 | location pointed to by `zSig1Ptr'.
2681 *----------------------------------------------------------------------------*/
2682
2683 static void
2684  normalizeFloat128Subnormal(
2685      uint64_t aSig0,
2686      uint64_t aSig1,
2687      int32_t *zExpPtr,
2688      uint64_t *zSig0Ptr,
2689      uint64_t *zSig1Ptr
2690  )
2691 {
2692     int8_t shiftCount;
2693
2694     if ( aSig0 == 0 ) {
2695         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2696         if ( shiftCount < 0 ) {
2697             *zSig0Ptr = aSig1>>( - shiftCount );
2698             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2699         }
2700         else {
2701             *zSig0Ptr = aSig1<<shiftCount;
2702             *zSig1Ptr = 0;
2703         }
2704         *zExpPtr = - shiftCount - 63;
2705     }
2706     else {
2707         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2708         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2709         *zExpPtr = 1 - shiftCount;
2710     }
2711
2712 }
2713
2714 /*----------------------------------------------------------------------------
2715 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2716 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2717 | floating-point value, returning the result.  After being shifted into the
2718 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2719 | added together to form the most significant 32 bits of the result.  This
2720 | means that any integer portion of `zSig0' will be added into the exponent.
2721 | Since a properly normalized significand will have an integer portion equal
2722 | to 1, the `zExp' input should be 1 less than the desired result exponent
2723 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2724 | significand.
2725 *----------------------------------------------------------------------------*/
2726
2727 static inline float128
2728  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2729 {
2730     float128 z;
2731
2732     z.low = zSig1;
2733     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2734     return z;
2735
2736 }
2737
2738 /*----------------------------------------------------------------------------
2739 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2740 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2741 | and `zSig2', and returns the proper quadruple-precision floating-point value
2742 | corresponding to the abstract input.  Ordinarily, the abstract value is
2743 | simply rounded and packed into the quadruple-precision format, with the
2744 | inexact exception raised if the abstract input cannot be represented
2745 | exactly.  However, if the abstract value is too large, the overflow and
2746 | inexact exceptions are raised and an infinity or maximal finite value is
2747 | returned.  If the abstract value is too small, the input value is rounded to
2748 | a subnormal number, and the underflow and inexact exceptions are raised if
2749 | the abstract input cannot be represented exactly as a subnormal quadruple-
2750 | precision floating-point number.
2751 |     The input significand must be normalized or smaller.  If the input
2752 | significand is not normalized, `zExp' must be 0; in that case, the result
2753 | returned is a subnormal number, and it must not require rounding.  In the
2754 | usual case that the input significand is normalized, `zExp' must be 1 less
2755 | than the ``true'' floating-point exponent.  The handling of underflow and
2756 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2757 *----------------------------------------------------------------------------*/
2758
2759 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2760                                      uint64_t zSig0, uint64_t zSig1,
2761                                      uint64_t zSig2, float_status *status)
2762 {
2763     int8_t roundingMode;
2764     flag roundNearestEven, increment, isTiny;
2765
2766     roundingMode = status->float_rounding_mode;
2767     roundNearestEven = ( roundingMode == float_round_nearest_even );
2768     switch (roundingMode) {
2769     case float_round_nearest_even:
2770     case float_round_ties_away:
2771         increment = ((int64_t)zSig2 < 0);
2772         break;
2773     case float_round_to_zero:
2774         increment = 0;
2775         break;
2776     case float_round_up:
2777         increment = !zSign && zSig2;
2778         break;
2779     case float_round_down:
2780         increment = zSign && zSig2;
2781         break;
2782     case float_round_to_odd:
2783         increment = !(zSig1 & 0x1) && zSig2;
2784         break;
2785     default:
2786         abort();
2787     }
2788     if ( 0x7FFD <= (uint32_t) zExp ) {
2789         if (    ( 0x7FFD < zExp )
2790              || (    ( zExp == 0x7FFD )
2791                   && eq128(
2792                          LIT64( 0x0001FFFFFFFFFFFF ),
2793                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2794                          zSig0,
2795                          zSig1
2796                      )
2797                   && increment
2798                 )
2799            ) {
2800             float_raise(float_flag_overflow | float_flag_inexact, status);
2801             if (    ( roundingMode == float_round_to_zero )
2802                  || ( zSign && ( roundingMode == float_round_up ) )
2803                  || ( ! zSign && ( roundingMode == float_round_down ) )
2804                  || (roundingMode == float_round_to_odd)
2805                ) {
2806                 return
2807                     packFloat128(
2808                         zSign,
2809                         0x7FFE,
2810                         LIT64( 0x0000FFFFFFFFFFFF ),
2811                         LIT64( 0xFFFFFFFFFFFFFFFF )
2812                     );
2813             }
2814             return packFloat128( zSign, 0x7FFF, 0, 0 );
2815         }
2816         if ( zExp < 0 ) {
2817             if (status->flush_to_zero) {
2818                 float_raise(float_flag_output_denormal, status);
2819                 return packFloat128(zSign, 0, 0, 0);
2820             }
2821             isTiny =
2822                    (status->float_detect_tininess
2823                     == float_tininess_before_rounding)
2824                 || ( zExp < -1 )
2825                 || ! increment
2826                 || lt128(
2827                        zSig0,
2828                        zSig1,
2829                        LIT64( 0x0001FFFFFFFFFFFF ),
2830                        LIT64( 0xFFFFFFFFFFFFFFFF )
2831                    );
2832             shift128ExtraRightJamming(
2833                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2834             zExp = 0;
2835             if (isTiny && zSig2) {
2836                 float_raise(float_flag_underflow, status);
2837             }
2838             switch (roundingMode) {
2839             case float_round_nearest_even:
2840             case float_round_ties_away:
2841                 increment = ((int64_t)zSig2 < 0);
2842                 break;
2843             case float_round_to_zero:
2844                 increment = 0;
2845                 break;
2846             case float_round_up:
2847                 increment = !zSign && zSig2;
2848                 break;
2849             case float_round_down:
2850                 increment = zSign && zSig2;
2851                 break;
2852             case float_round_to_odd:
2853                 increment = !(zSig1 & 0x1) && zSig2;
2854                 break;
2855             default:
2856                 abort();
2857             }
2858         }
2859     }
2860     if (zSig2) {
2861         status->float_exception_flags |= float_flag_inexact;
2862     }
2863     if ( increment ) {
2864         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2865         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2866     }
2867     else {
2868         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2869     }
2870     return packFloat128( zSign, zExp, zSig0, zSig1 );
2871
2872 }
2873
2874 /*----------------------------------------------------------------------------
2875 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2876 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2877 | returns the proper quadruple-precision floating-point value corresponding
2878 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2879 | except that the input significand has fewer bits and does not have to be
2880 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2881 | point exponent.
2882 *----------------------------------------------------------------------------*/
2883
2884 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2885                                               uint64_t zSig0, uint64_t zSig1,
2886                                               float_status *status)
2887 {
2888     int8_t shiftCount;
2889     uint64_t zSig2;
2890
2891     if ( zSig0 == 0 ) {
2892         zSig0 = zSig1;
2893         zSig1 = 0;
2894         zExp -= 64;
2895     }
2896     shiftCount = countLeadingZeros64( zSig0 ) - 15;
2897     if ( 0 <= shiftCount ) {
2898         zSig2 = 0;
2899         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2900     }
2901     else {
2902         shift128ExtraRightJamming(
2903             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
2904     }
2905     zExp -= shiftCount;
2906     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2907
2908 }
2909
2910
2911 /*----------------------------------------------------------------------------
2912 | Returns the result of converting the 32-bit two's complement integer `a'
2913 | to the extended double-precision floating-point format.  The conversion
2914 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2915 | Arithmetic.
2916 *----------------------------------------------------------------------------*/
2917
2918 floatx80 int32_to_floatx80(int32_t a, float_status *status)
2919 {
2920     flag zSign;
2921     uint32_t absA;
2922     int8_t shiftCount;
2923     uint64_t zSig;
2924
2925     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2926     zSign = ( a < 0 );
2927     absA = zSign ? - a : a;
2928     shiftCount = countLeadingZeros32( absA ) + 32;
2929     zSig = absA;
2930     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
2931
2932 }
2933
2934 /*----------------------------------------------------------------------------
2935 | Returns the result of converting the 32-bit two's complement integer `a' to
2936 | the quadruple-precision floating-point format.  The conversion is performed
2937 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2939
2940 float128 int32_to_float128(int32_t a, float_status *status)
2941 {
2942     flag zSign;
2943     uint32_t absA;
2944     int8_t shiftCount;
2945     uint64_t zSig0;
2946
2947     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2948     zSign = ( a < 0 );
2949     absA = zSign ? - a : a;
2950     shiftCount = countLeadingZeros32( absA ) + 17;
2951     zSig0 = absA;
2952     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
2953
2954 }
2955
2956 /*----------------------------------------------------------------------------
2957 | Returns the result of converting the 64-bit two's complement integer `a'
2958 | to the extended double-precision floating-point format.  The conversion
2959 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2960 | Arithmetic.
2961 *----------------------------------------------------------------------------*/
2962
2963 floatx80 int64_to_floatx80(int64_t a, float_status *status)
2964 {
2965     flag zSign;
2966     uint64_t absA;
2967     int8_t shiftCount;
2968
2969     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
2970     zSign = ( a < 0 );
2971     absA = zSign ? - a : a;
2972     shiftCount = countLeadingZeros64( absA );
2973     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
2974
2975 }
2976
2977 /*----------------------------------------------------------------------------
2978 | Returns the result of converting the 64-bit two's complement integer `a' to
2979 | the quadruple-precision floating-point format.  The conversion is performed
2980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2981 *----------------------------------------------------------------------------*/
2982
2983 float128 int64_to_float128(int64_t a, float_status *status)
2984 {
2985     flag zSign;
2986     uint64_t absA;
2987     int8_t shiftCount;
2988     int32_t zExp;
2989     uint64_t zSig0, zSig1;
2990
2991     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
2992     zSign = ( a < 0 );
2993     absA = zSign ? - a : a;
2994     shiftCount = countLeadingZeros64( absA ) + 49;
2995     zExp = 0x406E - shiftCount;
2996     if ( 64 <= shiftCount ) {
2997         zSig1 = 0;
2998         zSig0 = absA;
2999         shiftCount -= 64;
3000     }
3001     else {
3002         zSig1 = absA;
3003         zSig0 = 0;
3004     }
3005     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3006     return packFloat128( zSign, zExp, zSig0, zSig1 );
3007
3008 }
3009
3010 /*----------------------------------------------------------------------------
3011 | Returns the result of converting the 64-bit unsigned integer `a'
3012 | to the quadruple-precision floating-point format.  The conversion is performed
3013 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3014 *----------------------------------------------------------------------------*/
3015
3016 float128 uint64_to_float128(uint64_t a, float_status *status)
3017 {
3018     if (a == 0) {
3019         return float128_zero;
3020     }
3021     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3022 }
3023
3024
3025
3026
3027 /*----------------------------------------------------------------------------
3028 | Returns the result of converting the single-precision floating-point value
3029 | `a' to the double-precision floating-point format.  The conversion is
3030 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3031 | Arithmetic.
3032 *----------------------------------------------------------------------------*/
3033
3034 float64 float32_to_float64(float32 a, float_status *status)
3035 {
3036     flag aSign;
3037     int aExp;
3038     uint32_t aSig;
3039     a = float32_squash_input_denormal(a, status);
3040
3041     aSig = extractFloat32Frac( a );
3042     aExp = extractFloat32Exp( a );
3043     aSign = extractFloat32Sign( a );
3044     if ( aExp == 0xFF ) {
3045         if (aSig) {
3046             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3047         }
3048         return packFloat64( aSign, 0x7FF, 0 );
3049     }
3050     if ( aExp == 0 ) {
3051         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3052         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3053         --aExp;
3054     }
3055     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3056
3057 }
3058
3059 /*----------------------------------------------------------------------------
3060 | Returns the result of converting the single-precision floating-point value
3061 | `a' to the extended double-precision floating-point format.  The conversion
3062 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3063 | Arithmetic.
3064 *----------------------------------------------------------------------------*/
3065
3066 floatx80 float32_to_floatx80(float32 a, float_status *status)
3067 {
3068     flag aSign;
3069     int aExp;
3070     uint32_t aSig;
3071
3072     a = float32_squash_input_denormal(a, status);
3073     aSig = extractFloat32Frac( a );
3074     aExp = extractFloat32Exp( a );
3075     aSign = extractFloat32Sign( a );
3076     if ( aExp == 0xFF ) {
3077         if (aSig) {
3078             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3079         }
3080         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3081     }
3082     if ( aExp == 0 ) {
3083         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3084         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3085     }
3086     aSig |= 0x00800000;
3087     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3088
3089 }
3090
3091 /*----------------------------------------------------------------------------
3092 | Returns the result of converting the single-precision floating-point value
3093 | `a' to the double-precision floating-point format.  The conversion is
3094 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3095 | Arithmetic.
3096 *----------------------------------------------------------------------------*/
3097
3098 float128 float32_to_float128(float32 a, float_status *status)
3099 {
3100     flag aSign;
3101     int aExp;
3102     uint32_t aSig;
3103
3104     a = float32_squash_input_denormal(a, status);
3105     aSig = extractFloat32Frac( a );
3106     aExp = extractFloat32Exp( a );
3107     aSign = extractFloat32Sign( a );
3108     if ( aExp == 0xFF ) {
3109         if (aSig) {
3110             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3111         }
3112         return packFloat128( aSign, 0x7FFF, 0, 0 );
3113     }
3114     if ( aExp == 0 ) {
3115         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3116         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3117         --aExp;
3118     }
3119     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3120
3121 }
3122
3123 /*----------------------------------------------------------------------------
3124 | Returns the remainder of the single-precision floating-point value `a'
3125 | with respect to the corresponding value `b'.  The operation is performed
3126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3127 *----------------------------------------------------------------------------*/
3128
3129 float32 float32_rem(float32 a, float32 b, float_status *status)
3130 {
3131     flag aSign, zSign;
3132     int aExp, bExp, expDiff;
3133     uint32_t aSig, bSig;
3134     uint32_t q;
3135     uint64_t aSig64, bSig64, q64;
3136     uint32_t alternateASig;
3137     int32_t sigMean;
3138     a = float32_squash_input_denormal(a, status);
3139     b = float32_squash_input_denormal(b, status);
3140
3141     aSig = extractFloat32Frac( a );
3142     aExp = extractFloat32Exp( a );
3143     aSign = extractFloat32Sign( a );
3144     bSig = extractFloat32Frac( b );
3145     bExp = extractFloat32Exp( b );
3146     if ( aExp == 0xFF ) {
3147         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3148             return propagateFloat32NaN(a, b, status);
3149         }
3150         float_raise(float_flag_invalid, status);
3151         return float32_default_nan(status);
3152     }
3153     if ( bExp == 0xFF ) {
3154         if (bSig) {
3155             return propagateFloat32NaN(a, b, status);
3156         }
3157         return a;
3158     }
3159     if ( bExp == 0 ) {
3160         if ( bSig == 0 ) {
3161             float_raise(float_flag_invalid, status);
3162             return float32_default_nan(status);
3163         }
3164         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3165     }
3166     if ( aExp == 0 ) {
3167         if ( aSig == 0 ) return a;
3168         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3169     }
3170     expDiff = aExp - bExp;
3171     aSig |= 0x00800000;
3172     bSig |= 0x00800000;
3173     if ( expDiff < 32 ) {
3174         aSig <<= 8;
3175         bSig <<= 8;
3176         if ( expDiff < 0 ) {
3177             if ( expDiff < -1 ) return a;
3178             aSig >>= 1;
3179         }
3180         q = ( bSig <= aSig );
3181         if ( q ) aSig -= bSig;
3182         if ( 0 < expDiff ) {
3183             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3184             q >>= 32 - expDiff;
3185             bSig >>= 2;
3186             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3187         }
3188         else {
3189             aSig >>= 2;
3190             bSig >>= 2;
3191         }
3192     }
3193     else {
3194         if ( bSig <= aSig ) aSig -= bSig;
3195         aSig64 = ( (uint64_t) aSig )<<40;
3196         bSig64 = ( (uint64_t) bSig )<<40;
3197         expDiff -= 64;
3198         while ( 0 < expDiff ) {
3199             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3200             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3201             aSig64 = - ( ( bSig * q64 )<<38 );
3202             expDiff -= 62;
3203         }
3204         expDiff += 64;
3205         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3206         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3207         q = q64>>( 64 - expDiff );
3208         bSig <<= 6;
3209         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3210     }
3211     do {
3212         alternateASig = aSig;
3213         ++q;
3214         aSig -= bSig;
3215     } while ( 0 <= (int32_t) aSig );
3216     sigMean = aSig + alternateASig;
3217     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3218         aSig = alternateASig;
3219     }
3220     zSign = ( (int32_t) aSig < 0 );
3221     if ( zSign ) aSig = - aSig;
3222     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3223 }
3224
3225
3226 /*----------------------------------------------------------------------------
3227 | Returns the square root of the single-precision floating-point value `a'.
3228 | The operation is performed according to the IEC/IEEE Standard for Binary
3229 | Floating-Point Arithmetic.
3230 *----------------------------------------------------------------------------*/
3231
3232 float32 float32_sqrt(float32 a, float_status *status)
3233 {
3234     flag aSign;
3235     int aExp, zExp;
3236     uint32_t aSig, zSig;
3237     uint64_t rem, term;
3238     a = float32_squash_input_denormal(a, status);
3239
3240     aSig = extractFloat32Frac( a );
3241     aExp = extractFloat32Exp( a );
3242     aSign = extractFloat32Sign( a );
3243     if ( aExp == 0xFF ) {
3244         if (aSig) {
3245             return propagateFloat32NaN(a, float32_zero, status);
3246         }
3247         if ( ! aSign ) return a;
3248         float_raise(float_flag_invalid, status);
3249         return float32_default_nan(status);
3250     }
3251     if ( aSign ) {
3252         if ( ( aExp | aSig ) == 0 ) return a;
3253         float_raise(float_flag_invalid, status);
3254         return float32_default_nan(status);
3255     }
3256     if ( aExp == 0 ) {
3257         if ( aSig == 0 ) return float32_zero;
3258         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3259     }
3260     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
3261     aSig = ( aSig | 0x00800000 )<<8;
3262     zSig = estimateSqrt32( aExp, aSig ) + 2;
3263     if ( ( zSig & 0x7F ) <= 5 ) {
3264         if ( zSig < 2 ) {
3265             zSig = 0x7FFFFFFF;
3266             goto roundAndPack;
3267         }
3268         aSig >>= aExp & 1;
3269         term = ( (uint64_t) zSig ) * zSig;
3270         rem = ( ( (uint64_t) aSig )<<32 ) - term;
3271         while ( (int64_t) rem < 0 ) {
3272             --zSig;
3273             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
3274         }
3275         zSig |= ( rem != 0 );
3276     }
3277     shift32RightJamming( zSig, 1, &zSig );
3278  roundAndPack:
3279     return roundAndPackFloat32(0, zExp, zSig, status);
3280
3281 }
3282
3283 /*----------------------------------------------------------------------------
3284 | Returns the binary exponential of the single-precision floating-point value
3285 | `a'. The operation is performed according to the IEC/IEEE Standard for
3286 | Binary Floating-Point Arithmetic.
3287 |
3288 | Uses the following identities:
3289 |
3290 | 1. -------------------------------------------------------------------------
3291 |      x    x*ln(2)
3292 |     2  = e
3293 |
3294 | 2. -------------------------------------------------------------------------
3295 |                      2     3     4     5           n
3296 |      x        x     x     x     x     x           x
3297 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3298 |               1!    2!    3!    4!    5!          n!
3299 *----------------------------------------------------------------------------*/
3300
3301 static const float64 float32_exp2_coefficients[15] =
3302 {
3303     const_float64( 0x3ff0000000000000ll ), /*  1 */
3304     const_float64( 0x3fe0000000000000ll ), /*  2 */
3305     const_float64( 0x3fc5555555555555ll ), /*  3 */
3306     const_float64( 0x3fa5555555555555ll ), /*  4 */
3307     const_float64( 0x3f81111111111111ll ), /*  5 */
3308     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3309     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3310     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3311     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3312     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3313     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3314     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3315     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3316     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3317     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3318 };
3319
3320 float32 float32_exp2(float32 a, float_status *status)
3321 {
3322     flag aSign;
3323     int aExp;
3324     uint32_t aSig;
3325     float64 r, x, xn;
3326     int i;
3327     a = float32_squash_input_denormal(a, status);
3328
3329     aSig = extractFloat32Frac( a );
3330     aExp = extractFloat32Exp( a );
3331     aSign = extractFloat32Sign( a );
3332
3333     if ( aExp == 0xFF) {
3334         if (aSig) {
3335             return propagateFloat32NaN(a, float32_zero, status);
3336         }
3337         return (aSign) ? float32_zero : a;
3338     }
3339     if (aExp == 0) {
3340         if (aSig == 0) return float32_one;
3341     }
3342
3343     float_raise(float_flag_inexact, status);
3344
3345     /* ******************************* */
3346     /* using float64 for approximation */
3347     /* ******************************* */
3348     x = float32_to_float64(a, status);
3349     x = float64_mul(x, float64_ln2, status);
3350
3351     xn = x;
3352     r = float64_one;
3353     for (i = 0 ; i < 15 ; i++) {
3354         float64 f;
3355
3356         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3357         r = float64_add(r, f, status);
3358
3359         xn = float64_mul(xn, x, status);
3360     }
3361
3362     return float64_to_float32(r, status);
3363 }
3364
3365 /*----------------------------------------------------------------------------
3366 | Returns the binary log of the single-precision floating-point value `a'.
3367 | The operation is performed according to the IEC/IEEE Standard for Binary
3368 | Floating-Point Arithmetic.
3369 *----------------------------------------------------------------------------*/
3370 float32 float32_log2(float32 a, float_status *status)
3371 {
3372     flag aSign, zSign;
3373     int aExp;
3374     uint32_t aSig, zSig, i;
3375
3376     a = float32_squash_input_denormal(a, status);
3377     aSig = extractFloat32Frac( a );
3378     aExp = extractFloat32Exp( a );
3379     aSign = extractFloat32Sign( a );
3380
3381     if ( aExp == 0 ) {
3382         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3383         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3384     }
3385     if ( aSign ) {
3386         float_raise(float_flag_invalid, status);
3387         return float32_default_nan(status);
3388     }
3389     if ( aExp == 0xFF ) {
3390         if (aSig) {
3391             return propagateFloat32NaN(a, float32_zero, status);
3392         }
3393         return a;
3394     }
3395
3396     aExp -= 0x7F;
3397     aSig |= 0x00800000;
3398     zSign = aExp < 0;
3399     zSig = aExp << 23;
3400
3401     for (i = 1 << 22; i > 0; i >>= 1) {
3402         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3403         if ( aSig & 0x01000000 ) {
3404             aSig >>= 1;
3405             zSig |= i;
3406         }
3407     }
3408
3409     if ( zSign )
3410         zSig = -zSig;
3411
3412     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3413 }
3414
3415 /*----------------------------------------------------------------------------
3416 | Returns 1 if the single-precision floating-point value `a' is equal to
3417 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3418 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3420 *----------------------------------------------------------------------------*/
3421
3422 int float32_eq(float32 a, float32 b, float_status *status)
3423 {
3424     uint32_t av, bv;
3425     a = float32_squash_input_denormal(a, status);
3426     b = float32_squash_input_denormal(b, status);
3427
3428     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3429          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3430        ) {
3431         float_raise(float_flag_invalid, status);
3432         return 0;
3433     }
3434     av = float32_val(a);
3435     bv = float32_val(b);
3436     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3437 }
3438
3439 /*----------------------------------------------------------------------------
3440 | Returns 1 if the single-precision floating-point value `a' is less than
3441 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3442 | exception is raised if either operand is a NaN.  The comparison is performed
3443 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3444 *----------------------------------------------------------------------------*/
3445
3446 int float32_le(float32 a, float32 b, float_status *status)
3447 {
3448     flag aSign, bSign;
3449     uint32_t av, bv;
3450     a = float32_squash_input_denormal(a, status);
3451     b = float32_squash_input_denormal(b, status);
3452
3453     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3454          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3455        ) {
3456         float_raise(float_flag_invalid, status);
3457         return 0;
3458     }
3459     aSign = extractFloat32Sign( a );
3460     bSign = extractFloat32Sign( b );
3461     av = float32_val(a);
3462     bv = float32_val(b);
3463     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3464     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3465
3466 }
3467
3468 /*----------------------------------------------------------------------------
3469 | Returns 1 if the single-precision floating-point value `a' is less than
3470 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3471 | raised if either operand is a NaN.  The comparison is performed according
3472 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3473 *----------------------------------------------------------------------------*/
3474
3475 int float32_lt(float32 a, float32 b, float_status *status)
3476 {
3477     flag aSign, bSign;
3478     uint32_t av, bv;
3479     a = float32_squash_input_denormal(a, status);
3480     b = float32_squash_input_denormal(b, status);
3481
3482     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3483          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3484        ) {
3485         float_raise(float_flag_invalid, status);
3486         return 0;
3487     }
3488     aSign = extractFloat32Sign( a );
3489     bSign = extractFloat32Sign( b );
3490     av = float32_val(a);
3491     bv = float32_val(b);
3492     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3493     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3494
3495 }
3496
3497 /*----------------------------------------------------------------------------
3498 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3499 | be compared, and 0 otherwise.  The invalid exception is raised if either
3500 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3501 | Standard for Binary Floating-Point Arithmetic.
3502 *----------------------------------------------------------------------------*/
3503
3504 int float32_unordered(float32 a, float32 b, float_status *status)
3505 {
3506     a = float32_squash_input_denormal(a, status);
3507     b = float32_squash_input_denormal(b, status);
3508
3509     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3510          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3511        ) {
3512         float_raise(float_flag_invalid, status);
3513         return 1;
3514     }
3515     return 0;
3516 }
3517
3518 /*----------------------------------------------------------------------------
3519 | Returns 1 if the single-precision floating-point value `a' is equal to
3520 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3521 | exception.  The comparison is performed according to the IEC/IEEE Standard
3522 | for Binary Floating-Point Arithmetic.
3523 *----------------------------------------------------------------------------*/
3524
3525 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3526 {
3527     a = float32_squash_input_denormal(a, status);
3528     b = float32_squash_input_denormal(b, status);
3529
3530     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3531          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3532        ) {
3533         if (float32_is_signaling_nan(a, status)
3534          || float32_is_signaling_nan(b, status)) {
3535             float_raise(float_flag_invalid, status);
3536         }
3537         return 0;
3538     }
3539     return ( float32_val(a) == float32_val(b) ) ||
3540             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3541 }
3542
3543 /*----------------------------------------------------------------------------
3544 | Returns 1 if the single-precision floating-point value `a' is less than or
3545 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3546 | cause an exception.  Otherwise, the comparison is performed according to the
3547 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3548 *----------------------------------------------------------------------------*/
3549
3550 int float32_le_quiet(float32 a, float32 b, float_status *status)
3551 {
3552     flag aSign, bSign;
3553     uint32_t av, bv;
3554     a = float32_squash_input_denormal(a, status);
3555     b = float32_squash_input_denormal(b, status);
3556
3557     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3558          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3559        ) {
3560         if (float32_is_signaling_nan(a, status)
3561          || float32_is_signaling_nan(b, status)) {
3562             float_raise(float_flag_invalid, status);
3563         }
3564         return 0;
3565     }
3566     aSign = extractFloat32Sign( a );
3567     bSign = extractFloat32Sign( b );
3568     av = float32_val(a);
3569     bv = float32_val(b);
3570     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3571     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3572
3573 }
3574
3575 /*----------------------------------------------------------------------------
3576 | Returns 1 if the single-precision floating-point value `a' is less than
3577 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3578 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3579 | Standard for Binary Floating-Point Arithmetic.
3580 *----------------------------------------------------------------------------*/
3581
3582 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3583 {
3584     flag aSign, bSign;
3585     uint32_t av, bv;
3586     a = float32_squash_input_denormal(a, status);
3587     b = float32_squash_input_denormal(b, status);
3588
3589     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3590          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3591        ) {
3592         if (float32_is_signaling_nan(a, status)
3593          || float32_is_signaling_nan(b, status)) {
3594             float_raise(float_flag_invalid, status);
3595         }
3596         return 0;
3597     }
3598     aSign = extractFloat32Sign( a );
3599     bSign = extractFloat32Sign( b );
3600     av = float32_val(a);
3601     bv = float32_val(b);
3602     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3603     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3604
3605 }
3606
3607 /*----------------------------------------------------------------------------
3608 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3609 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3610 | comparison is performed according to the IEC/IEEE Standard for Binary
3611 | Floating-Point Arithmetic.
3612 *----------------------------------------------------------------------------*/
3613
3614 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3615 {
3616     a = float32_squash_input_denormal(a, status);
3617     b = float32_squash_input_denormal(b, status);
3618
3619     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3620          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3621        ) {
3622         if (float32_is_signaling_nan(a, status)
3623          || float32_is_signaling_nan(b, status)) {
3624             float_raise(float_flag_invalid, status);
3625         }
3626         return 1;
3627     }
3628     return 0;
3629 }
3630
3631
3632 /*----------------------------------------------------------------------------
3633 | Returns the result of converting the double-precision floating-point value
3634 | `a' to the single-precision floating-point format.  The conversion is
3635 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3636 | Arithmetic.
3637 *----------------------------------------------------------------------------*/
3638
3639 float32 float64_to_float32(float64 a, float_status *status)
3640 {
3641     flag aSign;
3642     int aExp;
3643     uint64_t aSig;
3644     uint32_t zSig;
3645     a = float64_squash_input_denormal(a, status);
3646
3647     aSig = extractFloat64Frac( a );
3648     aExp = extractFloat64Exp( a );
3649     aSign = extractFloat64Sign( a );
3650     if ( aExp == 0x7FF ) {
3651         if (aSig) {
3652             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3653         }
3654         return packFloat32( aSign, 0xFF, 0 );
3655     }
3656     shift64RightJamming( aSig, 22, &aSig );
3657     zSig = aSig;
3658     if ( aExp || zSig ) {
3659         zSig |= 0x40000000;
3660         aExp -= 0x381;
3661     }
3662     return roundAndPackFloat32(aSign, aExp, zSig, status);
3663
3664 }
3665
3666
3667 /*----------------------------------------------------------------------------
3668 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3669 | half-precision floating-point value, returning the result.  After being
3670 | shifted into the proper positions, the three fields are simply added
3671 | together to form the result.  This means that any integer portion of `zSig'
3672 | will be added into the exponent.  Since a properly normalized significand
3673 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3674 | than the desired result exponent whenever `zSig' is a complete, normalized
3675 | significand.
3676 *----------------------------------------------------------------------------*/
3677 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3678 {
3679     return make_float16(
3680         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3681 }
3682
3683 /*----------------------------------------------------------------------------
3684 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3685 | and significand `zSig', and returns the proper half-precision floating-
3686 | point value corresponding to the abstract input.  Ordinarily, the abstract
3687 | value is simply rounded and packed into the half-precision format, with
3688 | the inexact exception raised if the abstract input cannot be represented
3689 | exactly.  However, if the abstract value is too large, the overflow and
3690 | inexact exceptions are raised and an infinity or maximal finite value is
3691 | returned.  If the abstract value is too small, the input value is rounded to
3692 | a subnormal number, and the underflow and inexact exceptions are raised if
3693 | the abstract input cannot be represented exactly as a subnormal half-
3694 | precision floating-point number.
3695 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3696 | ARM-style "alternative representation", which omits the NaN and Inf
3697 | encodings in order to raise the maximum representable exponent by one.
3698 |     The input significand `zSig' has its binary point between bits 22
3699 | and 23, which is 13 bits to the left of the usual location.  This shifted
3700 | significand must be normalized or smaller.  If `zSig' is not normalized,
3701 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3702 | and it must not require rounding.  In the usual case that `zSig' is
3703 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3704 | Note the slightly odd position of the binary point in zSig compared with the
3705 | other roundAndPackFloat functions. This should probably be fixed if we
3706 | need to implement more float16 routines than just conversion.
3707 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3708 | Binary Floating-Point Arithmetic.
3709 *----------------------------------------------------------------------------*/
3710
3711 static float16 roundAndPackFloat16(flag zSign, int zExp,
3712                                    uint32_t zSig, flag ieee,
3713                                    float_status *status)
3714 {
3715     int maxexp = ieee ? 29 : 30;
3716     uint32_t mask;
3717     uint32_t increment;
3718     bool rounding_bumps_exp;
3719     bool is_tiny = false;
3720
3721     /* Calculate the mask of bits of the mantissa which are not
3722      * representable in half-precision and will be lost.
3723      */
3724     if (zExp < 1) {
3725         /* Will be denormal in halfprec */
3726         mask = 0x00ffffff;
3727         if (zExp >= -11) {
3728             mask >>= 11 + zExp;
3729         }
3730     } else {
3731         /* Normal number in halfprec */
3732         mask = 0x00001fff;
3733     }
3734
3735     switch (status->float_rounding_mode) {
3736     case float_round_nearest_even:
3737         increment = (mask + 1) >> 1;
3738         if ((zSig & mask) == increment) {
3739             increment = zSig & (increment << 1);
3740         }
3741         break;
3742     case float_round_ties_away:
3743         increment = (mask + 1) >> 1;
3744         break;
3745     case float_round_up:
3746         increment = zSign ? 0 : mask;
3747         break;
3748     case float_round_down:
3749         increment = zSign ? mask : 0;
3750         break;
3751     default: /* round_to_zero */
3752         increment = 0;
3753         break;
3754     }
3755
3756     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3757
3758     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3759         if (ieee) {
3760             float_raise(float_flag_overflow | float_flag_inexact, status);
3761             return packFloat16(zSign, 0x1f, 0);
3762         } else {
3763             float_raise(float_flag_invalid, status);
3764             return packFloat16(zSign, 0x1f, 0x3ff);
3765         }
3766     }
3767
3768     if (zExp < 0) {
3769         /* Note that flush-to-zero does not affect half-precision results */
3770         is_tiny =
3771             (status->float_detect_tininess == float_tininess_before_rounding)
3772             || (zExp < -1)
3773             || (!rounding_bumps_exp);
3774     }
3775     if (zSig & mask) {
3776         float_raise(float_flag_inexact, status);
3777         if (is_tiny) {
3778             float_raise(float_flag_underflow, status);
3779         }
3780     }
3781
3782     zSig += increment;
3783     if (rounding_bumps_exp) {
3784         zSig >>= 1;
3785         zExp++;
3786     }
3787
3788     if (zExp < -10) {
3789         return packFloat16(zSign, 0, 0);
3790     }
3791     if (zExp < 0) {
3792         zSig >>= -zExp;
3793         zExp = 0;
3794     }
3795     return packFloat16(zSign, zExp, zSig >> 13);
3796 }
3797
3798 /*----------------------------------------------------------------------------
3799 | If `a' is denormal and we are in flush-to-zero mode then set the
3800 | input-denormal exception and return zero. Otherwise just return the value.
3801 *----------------------------------------------------------------------------*/
3802 float16 float16_squash_input_denormal(float16 a, float_status *status)
3803 {
3804     if (status->flush_inputs_to_zero) {
3805         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3806             float_raise(float_flag_input_denormal, status);
3807             return make_float16(float16_val(a) & 0x8000);
3808         }
3809     }
3810     return a;
3811 }
3812
3813 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3814                                       uint32_t *zSigPtr)
3815 {
3816     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3817     *zSigPtr = aSig << shiftCount;
3818     *zExpPtr = 1 - shiftCount;
3819 }
3820
3821 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3822    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3823
3824 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3825 {
3826     flag aSign;
3827     int aExp;
3828     uint32_t aSig;
3829
3830     aSign = extractFloat16Sign(a);
3831     aExp = extractFloat16Exp(a);
3832     aSig = extractFloat16Frac(a);
3833
3834     if (aExp == 0x1f && ieee) {
3835         if (aSig) {
3836             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3837         }
3838         return packFloat32(aSign, 0xff, 0);
3839     }
3840     if (aExp == 0) {
3841         if (aSig == 0) {
3842             return packFloat32(aSign, 0, 0);
3843         }
3844
3845         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3846         aExp--;
3847     }
3848     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3849 }
3850
3851 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3852 {
3853     flag aSign;
3854     int aExp;
3855     uint32_t aSig;
3856
3857     a = float32_squash_input_denormal(a, status);
3858
3859     aSig = extractFloat32Frac( a );
3860     aExp = extractFloat32Exp( a );
3861     aSign = extractFloat32Sign( a );
3862     if ( aExp == 0xFF ) {
3863         if (aSig) {
3864             /* Input is a NaN */
3865             if (!ieee) {
3866                 float_raise(float_flag_invalid, status);
3867                 return packFloat16(aSign, 0, 0);
3868             }
3869             return commonNaNToFloat16(
3870                 float32ToCommonNaN(a, status), status);
3871         }
3872         /* Infinity */
3873         if (!ieee) {
3874             float_raise(float_flag_invalid, status);
3875             return packFloat16(aSign, 0x1f, 0x3ff);
3876         }
3877         return packFloat16(aSign, 0x1f, 0);
3878     }
3879     if (aExp == 0 && aSig == 0) {
3880         return packFloat16(aSign, 0, 0);
3881     }
3882     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3883      * even if the input is denormal; however this is harmless because
3884      * the largest possible single-precision denormal is still smaller
3885      * than the smallest representable half-precision denormal, and so we
3886      * will end up ignoring aSig and returning via the "always return zero"
3887      * codepath.
3888      */
3889     aSig |= 0x00800000;
3890     aExp -= 0x71;
3891
3892     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3893 }
3894
3895 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3896 {
3897     flag aSign;
3898     int aExp;
3899     uint32_t aSig;
3900
3901     aSign = extractFloat16Sign(a);
3902     aExp = extractFloat16Exp(a);
3903     aSig = extractFloat16Frac(a);
3904
3905     if (aExp == 0x1f && ieee) {
3906         if (aSig) {
3907             return commonNaNToFloat64(
3908                 float16ToCommonNaN(a, status), status);
3909         }
3910         return packFloat64(aSign, 0x7ff, 0);
3911     }
3912     if (aExp == 0) {
3913         if (aSig == 0) {
3914             return packFloat64(aSign, 0, 0);
3915         }
3916
3917         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3918         aExp--;
3919     }
3920     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3921 }
3922
3923 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3924 {
3925     flag aSign;
3926     int aExp;
3927     uint64_t aSig;
3928     uint32_t zSig;
3929
3930     a = float64_squash_input_denormal(a, status);
3931
3932     aSig = extractFloat64Frac(a);
3933     aExp = extractFloat64Exp(a);
3934     aSign = extractFloat64Sign(a);
3935     if (aExp == 0x7FF) {
3936         if (aSig) {
3937             /* Input is a NaN */
3938             if (!ieee) {
3939                 float_raise(float_flag_invalid, status);
3940                 return packFloat16(aSign, 0, 0);
3941             }
3942             return commonNaNToFloat16(
3943                 float64ToCommonNaN(a, status), status);
3944         }
3945         /* Infinity */
3946         if (!ieee) {
3947             float_raise(float_flag_invalid, status);
3948             return packFloat16(aSign, 0x1f, 0x3ff);
3949         }
3950         return packFloat16(aSign, 0x1f, 0);
3951     }
3952     shift64RightJamming(aSig, 29, &aSig);
3953     zSig = aSig;
3954     if (aExp == 0 && zSig == 0) {
3955         return packFloat16(aSign, 0, 0);
3956     }
3957     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3958      * even if the input is denormal; however this is harmless because
3959      * the largest possible single-precision denormal is still smaller
3960      * than the smallest representable half-precision denormal, and so we
3961      * will end up ignoring aSig and returning via the "always return zero"
3962      * codepath.
3963      */
3964     zSig |= 0x00800000;
3965     aExp -= 0x3F1;
3966
3967     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3968 }
3969
3970 /*----------------------------------------------------------------------------
3971 | Returns the result of converting the double-precision floating-point value
3972 | `a' to the extended double-precision floating-point format.  The conversion
3973 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3974 | Arithmetic.
3975 *----------------------------------------------------------------------------*/
3976
3977 floatx80 float64_to_floatx80(float64 a, float_status *status)
3978 {
3979     flag aSign;
3980     int aExp;
3981     uint64_t aSig;
3982
3983     a = float64_squash_input_denormal(a, status);
3984     aSig = extractFloat64Frac( a );
3985     aExp = extractFloat64Exp( a );
3986     aSign = extractFloat64Sign( a );
3987     if ( aExp == 0x7FF ) {
3988         if (aSig) {
3989             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3990         }
3991         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3992     }
3993     if ( aExp == 0 ) {
3994         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3995         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3996     }
3997     return
3998         packFloatx80(
3999             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4000
4001 }
4002
4003 /*----------------------------------------------------------------------------
4004 | Returns the result of converting the double-precision floating-point value
4005 | `a' to the quadruple-precision floating-point format.  The conversion is
4006 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4007 | Arithmetic.
4008 *----------------------------------------------------------------------------*/
4009
4010 float128 float64_to_float128(float64 a, float_status *status)
4011 {
4012     flag aSign;
4013     int aExp;
4014     uint64_t aSig, zSig0, zSig1;
4015
4016     a = float64_squash_input_denormal(a, status);
4017     aSig = extractFloat64Frac( a );
4018     aExp = extractFloat64Exp( a );
4019     aSign = extractFloat64Sign( a );
4020     if ( aExp == 0x7FF ) {
4021         if (aSig) {
4022             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4023         }
4024         return packFloat128( aSign, 0x7FFF, 0, 0 );
4025     }
4026     if ( aExp == 0 ) {
4027         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4028         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4029         --aExp;
4030     }
4031     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4032     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4033
4034 }
4035
4036
4037 /*----------------------------------------------------------------------------
4038 | Returns the remainder of the double-precision floating-point value `a'
4039 | with respect to the corresponding value `b'.  The operation is performed
4040 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4041 *----------------------------------------------------------------------------*/
4042
4043 float64 float64_rem(float64 a, float64 b, float_status *status)
4044 {
4045     flag aSign, zSign;
4046     int aExp, bExp, expDiff;
4047     uint64_t aSig, bSig;
4048     uint64_t q, alternateASig;
4049     int64_t sigMean;
4050
4051     a = float64_squash_input_denormal(a, status);
4052     b = float64_squash_input_denormal(b, status);
4053     aSig = extractFloat64Frac( a );
4054     aExp = extractFloat64Exp( a );
4055     aSign = extractFloat64Sign( a );
4056     bSig = extractFloat64Frac( b );
4057     bExp = extractFloat64Exp( b );
4058     if ( aExp == 0x7FF ) {
4059         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4060             return propagateFloat64NaN(a, b, status);
4061         }
4062         float_raise(float_flag_invalid, status);
4063         return float64_default_nan(status);
4064     }
4065     if ( bExp == 0x7FF ) {
4066         if (bSig) {
4067             return propagateFloat64NaN(a, b, status);
4068         }
4069         return a;
4070     }
4071     if ( bExp == 0 ) {
4072         if ( bSig == 0 ) {
4073             float_raise(float_flag_invalid, status);
4074             return float64_default_nan(status);
4075         }
4076         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4077     }
4078     if ( aExp == 0 ) {
4079         if ( aSig == 0 ) return a;
4080         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4081     }
4082     expDiff = aExp - bExp;
4083     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4084     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4085     if ( expDiff < 0 ) {
4086         if ( expDiff < -1 ) return a;
4087         aSig >>= 1;
4088     }
4089     q = ( bSig <= aSig );
4090     if ( q ) aSig -= bSig;
4091     expDiff -= 64;
4092     while ( 0 < expDiff ) {
4093         q = estimateDiv128To64( aSig, 0, bSig );
4094         q = ( 2 < q ) ? q - 2 : 0;
4095         aSig = - ( ( bSig>>2 ) * q );
4096         expDiff -= 62;
4097     }
4098     expDiff += 64;
4099     if ( 0 < expDiff ) {
4100         q = estimateDiv128To64( aSig, 0, bSig );
4101         q = ( 2 < q ) ? q - 2 : 0;
4102         q >>= 64 - expDiff;
4103         bSig >>= 2;
4104         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4105     }
4106     else {
4107         aSig >>= 2;
4108         bSig >>= 2;
4109     }
4110     do {
4111         alternateASig = aSig;
4112         ++q;
4113         aSig -= bSig;
4114     } while ( 0 <= (int64_t) aSig );
4115     sigMean = aSig + alternateASig;
4116     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4117         aSig = alternateASig;
4118     }
4119     zSign = ( (int64_t) aSig < 0 );
4120     if ( zSign ) aSig = - aSig;
4121     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4122
4123 }
4124
4125
4126 /*----------------------------------------------------------------------------
4127 | Returns the square root of the double-precision floating-point value `a'.
4128 | The operation is performed according to the IEC/IEEE Standard for Binary
4129 | Floating-Point Arithmetic.
4130 *----------------------------------------------------------------------------*/
4131
4132 float64 float64_sqrt(float64 a, float_status *status)
4133 {
4134     flag aSign;
4135     int aExp, zExp;
4136     uint64_t aSig, zSig, doubleZSig;
4137     uint64_t rem0, rem1, term0, term1;
4138     a = float64_squash_input_denormal(a, status);
4139
4140     aSig = extractFloat64Frac( a );
4141     aExp = extractFloat64Exp( a );
4142     aSign = extractFloat64Sign( a );
4143     if ( aExp == 0x7FF ) {
4144         if (aSig) {
4145             return propagateFloat64NaN(a, a, status);
4146         }
4147         if ( ! aSign ) return a;
4148         float_raise(float_flag_invalid, status);
4149         return float64_default_nan(status);
4150     }
4151     if ( aSign ) {
4152         if ( ( aExp | aSig ) == 0 ) return a;
4153         float_raise(float_flag_invalid, status);
4154         return float64_default_nan(status);
4155     }
4156     if ( aExp == 0 ) {
4157         if ( aSig == 0 ) return float64_zero;
4158         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4159     }
4160     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4161     aSig |= LIT64( 0x0010000000000000 );
4162     zSig = estimateSqrt32( aExp, aSig>>21 );
4163     aSig <<= 9 - ( aExp & 1 );
4164     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4165     if ( ( zSig & 0x1FF ) <= 5 ) {
4166         doubleZSig = zSig<<1;
4167         mul64To128( zSig, zSig, &term0, &term1 );
4168         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4169         while ( (int64_t) rem0 < 0 ) {
4170             --zSig;
4171             doubleZSig -= 2;
4172             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4173         }
4174         zSig |= ( ( rem0 | rem1 ) != 0 );
4175     }
4176     return roundAndPackFloat64(0, zExp, zSig, status);
4177
4178 }
4179
4180 /*----------------------------------------------------------------------------
4181 | Returns the binary log of the double-precision floating-point value `a'.
4182 | The operation is performed according to the IEC/IEEE Standard for Binary
4183 | Floating-Point Arithmetic.
4184 *----------------------------------------------------------------------------*/
4185 float64 float64_log2(float64 a, float_status *status)
4186 {
4187     flag aSign, zSign;
4188     int aExp;
4189     uint64_t aSig, aSig0, aSig1, zSig, i;
4190     a = float64_squash_input_denormal(a, status);
4191
4192     aSig = extractFloat64Frac( a );
4193     aExp = extractFloat64Exp( a );
4194     aSign = extractFloat64Sign( a );
4195
4196     if ( aExp == 0 ) {
4197         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4198         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4199     }
4200     if ( aSign ) {
4201         float_raise(float_flag_invalid, status);
4202         return float64_default_nan(status);
4203     }
4204     if ( aExp == 0x7FF ) {
4205         if (aSig) {
4206             return propagateFloat64NaN(a, float64_zero, status);
4207         }
4208         return a;
4209     }
4210
4211     aExp -= 0x3FF;
4212     aSig |= LIT64( 0x0010000000000000 );
4213     zSign = aExp < 0;
4214     zSig = (uint64_t)aExp << 52;
4215     for (i = 1LL << 51; i > 0; i >>= 1) {
4216         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4217         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4218         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4219             aSig >>= 1;
4220             zSig |= i;
4221         }
4222     }
4223
4224     if ( zSign )
4225         zSig = -zSig;
4226     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4227 }
4228
4229 /*----------------------------------------------------------------------------
4230 | Returns 1 if the double-precision floating-point value `a' is equal to the
4231 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4232 | if either operand is a NaN.  Otherwise, the comparison is performed
4233 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 *----------------------------------------------------------------------------*/
4235
4236 int float64_eq(float64 a, float64 b, float_status *status)
4237 {
4238     uint64_t av, bv;
4239     a = float64_squash_input_denormal(a, status);
4240     b = float64_squash_input_denormal(b, status);
4241
4242     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4243          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4244        ) {
4245         float_raise(float_flag_invalid, status);
4246         return 0;
4247     }
4248     av = float64_val(a);
4249     bv = float64_val(b);
4250     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4251
4252 }
4253
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the double-precision floating-point value `a' is less than or
4256 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4257 | exception is raised if either operand is a NaN.  The comparison is performed
4258 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4259 *----------------------------------------------------------------------------*/
4260
4261 int float64_le(float64 a, float64 b, float_status *status)
4262 {
4263     flag aSign, bSign;
4264     uint64_t av, bv;
4265     a = float64_squash_input_denormal(a, status);
4266     b = float64_squash_input_denormal(b, status);
4267
4268     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4269          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4270        ) {
4271         float_raise(float_flag_invalid, status);
4272         return 0;
4273     }
4274     aSign = extractFloat64Sign( a );
4275     bSign = extractFloat64Sign( b );
4276     av = float64_val(a);
4277     bv = float64_val(b);
4278     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4279     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4280
4281 }
4282
4283 /*----------------------------------------------------------------------------
4284 | Returns 1 if the double-precision floating-point value `a' is less than
4285 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4286 | raised if either operand is a NaN.  The comparison is performed according
4287 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4288 *----------------------------------------------------------------------------*/
4289
4290 int float64_lt(float64 a, float64 b, float_status *status)
4291 {
4292     flag aSign, bSign;
4293     uint64_t av, bv;
4294
4295     a = float64_squash_input_denormal(a, status);
4296     b = float64_squash_input_denormal(b, status);
4297     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4298          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4299        ) {
4300         float_raise(float_flag_invalid, status);
4301         return 0;
4302     }
4303     aSign = extractFloat64Sign( a );
4304     bSign = extractFloat64Sign( b );
4305     av = float64_val(a);
4306     bv = float64_val(b);
4307     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4308     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4309
4310 }
4311
4312 /*----------------------------------------------------------------------------
4313 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4314 | be compared, and 0 otherwise.  The invalid exception is raised if either
4315 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4316 | Standard for Binary Floating-Point Arithmetic.
4317 *----------------------------------------------------------------------------*/
4318
4319 int float64_unordered(float64 a, float64 b, float_status *status)
4320 {
4321     a = float64_squash_input_denormal(a, status);
4322     b = float64_squash_input_denormal(b, status);
4323
4324     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4325          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4326        ) {
4327         float_raise(float_flag_invalid, status);
4328         return 1;
4329     }
4330     return 0;
4331 }
4332
4333 /*----------------------------------------------------------------------------
4334 | Returns 1 if the double-precision floating-point value `a' is equal to the
4335 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4336 | exception.The comparison is performed according to the IEC/IEEE Standard
4337 | for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4339
4340 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4341 {
4342     uint64_t av, bv;
4343     a = float64_squash_input_denormal(a, status);
4344     b = float64_squash_input_denormal(b, status);
4345
4346     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4347          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4348        ) {
4349         if (float64_is_signaling_nan(a, status)
4350          || float64_is_signaling_nan(b, status)) {
4351             float_raise(float_flag_invalid, status);
4352         }
4353         return 0;
4354     }
4355     av = float64_val(a);
4356     bv = float64_val(b);
4357     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4358
4359 }
4360
4361 /*----------------------------------------------------------------------------
4362 | Returns 1 if the double-precision floating-point value `a' is less than or
4363 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4364 | cause an exception.  Otherwise, the comparison is performed according to the
4365 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4366 *----------------------------------------------------------------------------*/
4367
4368 int float64_le_quiet(float64 a, float64 b, float_status *status)
4369 {
4370     flag aSign, bSign;
4371     uint64_t av, bv;
4372     a = float64_squash_input_denormal(a, status);
4373     b = float64_squash_input_denormal(b, status);
4374
4375     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4376          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4377        ) {
4378         if (float64_is_signaling_nan(a, status)
4379          || float64_is_signaling_nan(b, status)) {
4380             float_raise(float_flag_invalid, status);
4381         }
4382         return 0;
4383     }
4384     aSign = extractFloat64Sign( a );
4385     bSign = extractFloat64Sign( b );
4386     av = float64_val(a);
4387     bv = float64_val(b);
4388     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4389     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4390
4391 }
4392
4393 /*----------------------------------------------------------------------------
4394 | Returns 1 if the double-precision floating-point value `a' is less than
4395 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4396 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4397 | Standard for Binary Floating-Point Arithmetic.
4398 *----------------------------------------------------------------------------*/
4399
4400 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4401 {
4402     flag aSign, bSign;
4403     uint64_t av, bv;
4404     a = float64_squash_input_denormal(a, status);
4405     b = float64_squash_input_denormal(b, status);
4406
4407     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4408          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4409        ) {
4410         if (float64_is_signaling_nan(a, status)
4411          || float64_is_signaling_nan(b, status)) {
4412             float_raise(float_flag_invalid, status);
4413         }
4414         return 0;
4415     }
4416     aSign = extractFloat64Sign( a );
4417     bSign = extractFloat64Sign( b );
4418     av = float64_val(a);
4419     bv = float64_val(b);
4420     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4421     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4422
4423 }
4424
4425 /*----------------------------------------------------------------------------
4426 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4427 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4428 | comparison is performed according to the IEC/IEEE Standard for Binary
4429 | Floating-Point Arithmetic.
4430 *----------------------------------------------------------------------------*/
4431
4432 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4433 {
4434     a = float64_squash_input_denormal(a, status);
4435     b = float64_squash_input_denormal(b, status);
4436
4437     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4438          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4439        ) {
4440         if (float64_is_signaling_nan(a, status)
4441          || float64_is_signaling_nan(b, status)) {
4442             float_raise(float_flag_invalid, status);
4443         }
4444         return 1;
4445     }
4446     return 0;
4447 }
4448
4449 /*----------------------------------------------------------------------------
4450 | Returns the result of converting the extended double-precision floating-
4451 | point value `a' to the 32-bit two's complement integer format.  The
4452 | conversion is performed according to the IEC/IEEE Standard for Binary
4453 | Floating-Point Arithmetic---which means in particular that the conversion
4454 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4455 | largest positive integer is returned.  Otherwise, if the conversion
4456 | overflows, the largest integer with the same sign as `a' is returned.
4457 *----------------------------------------------------------------------------*/
4458
4459 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4460 {
4461     flag aSign;
4462     int32_t aExp, shiftCount;
4463     uint64_t aSig;
4464
4465     if (floatx80_invalid_encoding(a)) {
4466         float_raise(float_flag_invalid, status);
4467         return 1 << 31;
4468     }
4469     aSig = extractFloatx80Frac( a );
4470     aExp = extractFloatx80Exp( a );
4471     aSign = extractFloatx80Sign( a );
4472     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4473     shiftCount = 0x4037 - aExp;
4474     if ( shiftCount <= 0 ) shiftCount = 1;
4475     shift64RightJamming( aSig, shiftCount, &aSig );
4476     return roundAndPackInt32(aSign, aSig, status);
4477
4478 }
4479
4480 /*----------------------------------------------------------------------------
4481 | Returns the result of converting the extended double-precision floating-
4482 | point value `a' to the 32-bit two's complement integer format.  The
4483 | conversion is performed according to the IEC/IEEE Standard for Binary
4484 | Floating-Point Arithmetic, except that the conversion is always rounded
4485 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4486 | Otherwise, if the conversion overflows, the largest integer with the same
4487 | sign as `a' is returned.
4488 *----------------------------------------------------------------------------*/
4489
4490 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4491 {
4492     flag aSign;
4493     int32_t aExp, shiftCount;
4494     uint64_t aSig, savedASig;
4495     int32_t z;
4496
4497     if (floatx80_invalid_encoding(a)) {
4498         float_raise(float_flag_invalid, status);
4499         return 1 << 31;
4500     }
4501     aSig = extractFloatx80Frac( a );
4502     aExp = extractFloatx80Exp( a );
4503     aSign = extractFloatx80Sign( a );
4504     if ( 0x401E < aExp ) {
4505         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4506         goto invalid;
4507     }
4508     else if ( aExp < 0x3FFF ) {
4509         if (aExp || aSig) {
4510             status->float_exception_flags |= float_flag_inexact;
4511         }
4512         return 0;
4513     }
4514     shiftCount = 0x403E - aExp;
4515     savedASig = aSig;
4516     aSig >>= shiftCount;
4517     z = aSig;
4518     if ( aSign ) z = - z;
4519     if ( ( z < 0 ) ^ aSign ) {
4520  invalid:
4521         float_raise(float_flag_invalid, status);
4522         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4523     }
4524     if ( ( aSig<<shiftCount ) != savedASig ) {
4525         status->float_exception_flags |= float_flag_inexact;
4526     }
4527     return z;
4528
4529 }
4530
4531 /*----------------------------------------------------------------------------
4532 | Returns the result of converting the extended double-precision floating-
4533 | point value `a' to the 64-bit two's complement integer format.  The
4534 | conversion is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic---which means in particular that the conversion
4536 | is rounded according to the current rounding mode.  If `a' is a NaN,
4537 | the largest positive integer is returned.  Otherwise, if the conversion
4538 | overflows, the largest integer with the same sign as `a' is returned.
4539 *----------------------------------------------------------------------------*/
4540
4541 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4542 {
4543     flag aSign;
4544     int32_t aExp, shiftCount;
4545     uint64_t aSig, aSigExtra;
4546
4547     if (floatx80_invalid_encoding(a)) {
4548         float_raise(float_flag_invalid, status);
4549         return 1ULL << 63;
4550     }
4551     aSig = extractFloatx80Frac( a );
4552     aExp = extractFloatx80Exp( a );
4553     aSign = extractFloatx80Sign( a );
4554     shiftCount = 0x403E - aExp;
4555     if ( shiftCount <= 0 ) {
4556         if ( shiftCount ) {
4557             float_raise(float_flag_invalid, status);
4558             if (    ! aSign
4559                  || (    ( aExp == 0x7FFF )
4560                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4561                ) {
4562                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4563             }
4564             return (int64_t) LIT64( 0x8000000000000000 );
4565         }
4566         aSigExtra = 0;
4567     }
4568     else {
4569         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4570     }
4571     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4572
4573 }
4574
4575 /*----------------------------------------------------------------------------
4576 | Returns the result of converting the extended double-precision floating-
4577 | point value `a' to the 64-bit two's complement integer format.  The
4578 | conversion is performed according to the IEC/IEEE Standard for Binary
4579 | Floating-Point Arithmetic, except that the conversion is always rounded
4580 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4581 | Otherwise, if the conversion overflows, the largest integer with the same
4582 | sign as `a' is returned.
4583 *----------------------------------------------------------------------------*/
4584
4585 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4586 {
4587     flag aSign;
4588     int32_t aExp, shiftCount;
4589     uint64_t aSig;
4590     int64_t z;
4591
4592     if (floatx80_invalid_encoding(a)) {
4593         float_raise(float_flag_invalid, status);
4594         return 1ULL << 63;
4595     }
4596     aSig = extractFloatx80Frac( a );
4597     aExp = extractFloatx80Exp( a );
4598     aSign = extractFloatx80Sign( a );
4599     shiftCount = aExp - 0x403E;
4600     if ( 0 <= shiftCount ) {
4601         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4602         if ( ( a.high != 0xC03E ) || aSig ) {
4603             float_raise(float_flag_invalid, status);
4604             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4605                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4606             }
4607         }
4608         return (int64_t) LIT64( 0x8000000000000000 );
4609     }
4610     else if ( aExp < 0x3FFF ) {
4611         if (aExp | aSig) {
4612             status->float_exception_flags |= float_flag_inexact;
4613         }
4614         return 0;
4615     }
4616     z = aSig>>( - shiftCount );
4617     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4618         status->float_exception_flags |= float_flag_inexact;
4619     }
4620     if ( aSign ) z = - z;
4621     return z;
4622
4623 }
4624
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of converting the extended double-precision floating-
4627 | point value `a' to the single-precision floating-point format.  The
4628 | conversion is performed according to the IEC/IEEE Standard for Binary
4629 | Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631
4632 float32 floatx80_to_float32(floatx80 a, float_status *status)
4633 {
4634     flag aSign;
4635     int32_t aExp;
4636     uint64_t aSig;
4637
4638     if (floatx80_invalid_encoding(a)) {
4639         float_raise(float_flag_invalid, status);
4640         return float32_default_nan(status);
4641     }
4642     aSig = extractFloatx80Frac( a );
4643     aExp = extractFloatx80Exp( a );
4644     aSign = extractFloatx80Sign( a );
4645     if ( aExp == 0x7FFF ) {
4646         if ( (uint64_t) ( aSig<<1 ) ) {
4647             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4648         }
4649         return packFloat32( aSign, 0xFF, 0 );
4650     }
4651     shift64RightJamming( aSig, 33, &aSig );
4652     if ( aExp || aSig ) aExp -= 0x3F81;
4653     return roundAndPackFloat32(aSign, aExp, aSig, status);
4654
4655 }
4656
4657 /*----------------------------------------------------------------------------
4658 | Returns the result of converting the extended double-precision floating-
4659 | point value `a' to the double-precision floating-point format.  The
4660 | conversion is performed according to the IEC/IEEE Standard for Binary
4661 | Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4663
4664 float64 floatx80_to_float64(floatx80 a, float_status *status)
4665 {
4666     flag aSign;
4667     int32_t aExp;
4668     uint64_t aSig, zSig;
4669
4670     if (floatx80_invalid_encoding(a)) {
4671         float_raise(float_flag_invalid, status);
4672         return float64_default_nan(status);
4673     }
4674     aSig = extractFloatx80Frac( a );
4675     aExp = extractFloatx80Exp( a );
4676     aSign = extractFloatx80Sign( a );
4677     if ( aExp == 0x7FFF ) {
4678         if ( (uint64_t) ( aSig<<1 ) ) {
4679             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4680         }
4681         return packFloat64( aSign, 0x7FF, 0 );
4682     }
4683     shift64RightJamming( aSig, 1, &zSig );
4684     if ( aExp || aSig ) aExp -= 0x3C01;
4685     return roundAndPackFloat64(aSign, aExp, zSig, status);
4686
4687 }
4688
4689 /*----------------------------------------------------------------------------
4690 | Returns the result of converting the extended double-precision floating-
4691 | point value `a' to the quadruple-precision floating-point format.  The
4692 | conversion is performed according to the IEC/IEEE Standard for Binary
4693 | Floating-Point Arithmetic.
4694 *----------------------------------------------------------------------------*/
4695
4696 float128 floatx80_to_float128(floatx80 a, float_status *status)
4697 {
4698     flag aSign;
4699     int aExp;
4700     uint64_t aSig, zSig0, zSig1;
4701
4702     if (floatx80_invalid_encoding(a)) {
4703         float_raise(float_flag_invalid, status);
4704         return float128_default_nan(status);
4705     }
4706     aSig = extractFloatx80Frac( a );
4707     aExp = extractFloatx80Exp( a );
4708     aSign = extractFloatx80Sign( a );
4709     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4710         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4711     }
4712     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4713     return packFloat128( aSign, aExp, zSig0, zSig1 );
4714
4715 }
4716
4717 /*----------------------------------------------------------------------------
4718 | Rounds the extended double-precision floating-point value `a'
4719 | to the precision provided by floatx80_rounding_precision and returns the
4720 | result as an extended double-precision floating-point value.
4721 | The operation is performed according to the IEC/IEEE Standard for Binary
4722 | Floating-Point Arithmetic.
4723 *----------------------------------------------------------------------------*/
4724
4725 floatx80 floatx80_round(floatx80 a, float_status *status)
4726 {
4727     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4728                                 extractFloatx80Sign(a),
4729                                 extractFloatx80Exp(a),
4730                                 extractFloatx80Frac(a), 0, status);
4731 }
4732
4733 /*----------------------------------------------------------------------------
4734 | Rounds the extended double-precision floating-point value `a' to an integer,
4735 | and returns the result as an extended quadruple-precision floating-point
4736 | value.  The operation is performed according to the IEC/IEEE Standard for
4737 | Binary Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4739
4740 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4741 {
4742     flag aSign;
4743     int32_t aExp;
4744     uint64_t lastBitMask, roundBitsMask;
4745     floatx80 z;
4746
4747     if (floatx80_invalid_encoding(a)) {
4748         float_raise(float_flag_invalid, status);
4749         return floatx80_default_nan(status);
4750     }
4751     aExp = extractFloatx80Exp( a );
4752     if ( 0x403E <= aExp ) {
4753         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4754             return propagateFloatx80NaN(a, a, status);
4755         }
4756         return a;
4757     }
4758     if ( aExp < 0x3FFF ) {
4759         if (    ( aExp == 0 )
4760              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4761             return a;
4762         }
4763         status->float_exception_flags |= float_flag_inexact;
4764         aSign = extractFloatx80Sign( a );
4765         switch (status->float_rounding_mode) {
4766          case float_round_nearest_even:
4767             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4768                ) {
4769                 return
4770                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4771             }
4772             break;
4773         case float_round_ties_away:
4774             if (aExp == 0x3FFE) {
4775                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4776             }
4777             break;
4778          case float_round_down:
4779             return
4780                   aSign ?
4781                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4782                 : packFloatx80( 0, 0, 0 );
4783          case float_round_up:
4784             return
4785                   aSign ? packFloatx80( 1, 0, 0 )
4786                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4787         }
4788         return packFloatx80( aSign, 0, 0 );
4789     }
4790     lastBitMask = 1;
4791     lastBitMask <<= 0x403E - aExp;
4792     roundBitsMask = lastBitMask - 1;
4793     z = a;
4794     switch (status->float_rounding_mode) {
4795     case float_round_nearest_even:
4796         z.low += lastBitMask>>1;
4797         if ((z.low & roundBitsMask) == 0) {
4798             z.low &= ~lastBitMask;
4799         }
4800         break;
4801     case float_round_ties_away:
4802         z.low += lastBitMask >> 1;
4803         break;
4804     case float_round_to_zero:
4805         break;
4806     case float_round_up:
4807         if (!extractFloatx80Sign(z)) {
4808             z.low += roundBitsMask;
4809         }
4810         break;
4811     case float_round_down:
4812         if (extractFloatx80Sign(z)) {
4813             z.low += roundBitsMask;
4814         }
4815         break;
4816     default:
4817         abort();
4818     }
4819     z.low &= ~ roundBitsMask;
4820     if ( z.low == 0 ) {
4821         ++z.high;
4822         z.low = LIT64( 0x8000000000000000 );
4823     }
4824     if (z.low != a.low) {
4825         status->float_exception_flags |= float_flag_inexact;
4826     }
4827     return z;
4828
4829 }
4830
4831 /*----------------------------------------------------------------------------
4832 | Returns the result of adding the absolute values of the extended double-
4833 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4834 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4835 | The addition is performed according to the IEC/IEEE Standard for Binary
4836 | Floating-Point Arithmetic.
4837 *----------------------------------------------------------------------------*/
4838
4839 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4840                                 float_status *status)
4841 {
4842     int32_t aExp, bExp, zExp;
4843     uint64_t aSig, bSig, zSig0, zSig1;
4844     int32_t expDiff;
4845
4846     aSig = extractFloatx80Frac( a );
4847     aExp = extractFloatx80Exp( a );
4848     bSig = extractFloatx80Frac( b );
4849     bExp = extractFloatx80Exp( b );
4850     expDiff = aExp - bExp;
4851     if ( 0 < expDiff ) {
4852         if ( aExp == 0x7FFF ) {
4853             if ((uint64_t)(aSig << 1)) {
4854                 return propagateFloatx80NaN(a, b, status);
4855             }
4856             return a;
4857         }
4858         if ( bExp == 0 ) --expDiff;
4859         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4860         zExp = aExp;
4861     }
4862     else if ( expDiff < 0 ) {
4863         if ( bExp == 0x7FFF ) {
4864             if ((uint64_t)(bSig << 1)) {
4865                 return propagateFloatx80NaN(a, b, status);
4866             }
4867             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4868         }
4869         if ( aExp == 0 ) ++expDiff;
4870         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4871         zExp = bExp;
4872     }
4873     else {
4874         if ( aExp == 0x7FFF ) {
4875             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4876                 return propagateFloatx80NaN(a, b, status);
4877             }
4878             return a;
4879         }
4880         zSig1 = 0;
4881         zSig0 = aSig + bSig;
4882         if ( aExp == 0 ) {
4883             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4884             goto roundAndPack;
4885         }
4886         zExp = aExp;
4887         goto shiftRight1;
4888     }
4889     zSig0 = aSig + bSig;
4890     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4891  shiftRight1:
4892     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4893     zSig0 |= LIT64( 0x8000000000000000 );
4894     ++zExp;
4895  roundAndPack:
4896     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4897                                 zSign, zExp, zSig0, zSig1, status);
4898 }
4899
4900 /*----------------------------------------------------------------------------
4901 | Returns the result of subtracting the absolute values of the extended
4902 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4903 | difference is negated before being returned.  `zSign' is ignored if the
4904 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4905 | Standard for Binary Floating-Point Arithmetic.
4906 *----------------------------------------------------------------------------*/
4907
4908 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4909                                 float_status *status)
4910 {
4911     int32_t aExp, bExp, zExp;
4912     uint64_t aSig, bSig, zSig0, zSig1;
4913     int32_t expDiff;
4914
4915     aSig = extractFloatx80Frac( a );
4916     aExp = extractFloatx80Exp( a );
4917     bSig = extractFloatx80Frac( b );
4918     bExp = extractFloatx80Exp( b );
4919     expDiff = aExp - bExp;
4920     if ( 0 < expDiff ) goto aExpBigger;
4921     if ( expDiff < 0 ) goto bExpBigger;
4922     if ( aExp == 0x7FFF ) {
4923         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4924             return propagateFloatx80NaN(a, b, status);
4925         }
4926         float_raise(float_flag_invalid, status);
4927         return floatx80_default_nan(status);
4928     }
4929     if ( aExp == 0 ) {
4930         aExp = 1;
4931         bExp = 1;
4932     }
4933     zSig1 = 0;
4934     if ( bSig < aSig ) goto aBigger;
4935     if ( aSig < bSig ) goto bBigger;
4936     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4937  bExpBigger:
4938     if ( bExp == 0x7FFF ) {
4939         if ((uint64_t)(bSig << 1)) {
4940             return propagateFloatx80NaN(a, b, status);
4941         }
4942         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4943     }
4944     if ( aExp == 0 ) ++expDiff;
4945     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4946  bBigger:
4947     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4948     zExp = bExp;
4949     zSign ^= 1;
4950     goto normalizeRoundAndPack;
4951  aExpBigger:
4952     if ( aExp == 0x7FFF ) {
4953         if ((uint64_t)(aSig << 1)) {
4954             return propagateFloatx80NaN(a, b, status);
4955         }
4956         return a;
4957     }
4958     if ( bExp == 0 ) --expDiff;
4959     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4960  aBigger:
4961     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4962     zExp = aExp;
4963  normalizeRoundAndPack:
4964     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4965                                          zSign, zExp, zSig0, zSig1, status);
4966 }
4967
4968 /*----------------------------------------------------------------------------
4969 | Returns the result of adding the extended double-precision floating-point
4970 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4971 | Standard for Binary Floating-Point Arithmetic.
4972 *----------------------------------------------------------------------------*/
4973
4974 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4975 {
4976     flag aSign, bSign;
4977
4978     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4979         float_raise(float_flag_invalid, status);
4980         return floatx80_default_nan(status);
4981     }
4982     aSign = extractFloatx80Sign( a );
4983     bSign = extractFloatx80Sign( b );
4984     if ( aSign == bSign ) {
4985         return addFloatx80Sigs(a, b, aSign, status);
4986     }
4987     else {
4988         return subFloatx80Sigs(a, b, aSign, status);
4989     }
4990
4991 }
4992
4993 /*----------------------------------------------------------------------------
4994 | Returns the result of subtracting the extended double-precision floating-
4995 | point values `a' and `b'.  The operation is performed according to the
4996 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4998
4999 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5000 {
5001     flag aSign, bSign;
5002
5003     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5004         float_raise(float_flag_invalid, status);
5005         return floatx80_default_nan(status);
5006     }
5007     aSign = extractFloatx80Sign( a );
5008     bSign = extractFloatx80Sign( b );
5009     if ( aSign == bSign ) {
5010         return subFloatx80Sigs(a, b, aSign, status);
5011     }
5012     else {
5013         return addFloatx80Sigs(a, b, aSign, status);
5014     }
5015
5016 }
5017
5018 /*----------------------------------------------------------------------------
5019 | Returns the result of multiplying the extended double-precision floating-
5020 | point values `a' and `b'.  The operation is performed according to the
5021 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5023
5024 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5025 {
5026     flag aSign, bSign, zSign;
5027     int32_t aExp, bExp, zExp;
5028     uint64_t aSig, bSig, zSig0, zSig1;
5029
5030     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5031         float_raise(float_flag_invalid, status);
5032         return floatx80_default_nan(status);
5033     }
5034     aSig = extractFloatx80Frac( a );
5035     aExp = extractFloatx80Exp( a );
5036     aSign = extractFloatx80Sign( a );
5037     bSig = extractFloatx80Frac( b );
5038     bExp = extractFloatx80Exp( b );
5039     bSign = extractFloatx80Sign( b );
5040     zSign = aSign ^ bSign;
5041     if ( aExp == 0x7FFF ) {
5042         if (    (uint64_t) ( aSig<<1 )
5043              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5044             return propagateFloatx80NaN(a, b, status);
5045         }
5046         if ( ( bExp | bSig ) == 0 ) goto invalid;
5047         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5048     }
5049     if ( bExp == 0x7FFF ) {
5050         if ((uint64_t)(bSig << 1)) {
5051             return propagateFloatx80NaN(a, b, status);
5052         }
5053         if ( ( aExp | aSig ) == 0 ) {
5054  invalid:
5055             float_raise(float_flag_invalid, status);
5056             return floatx80_default_nan(status);
5057         }
5058         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5059     }
5060     if ( aExp == 0 ) {
5061         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5062         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5063     }
5064     if ( bExp == 0 ) {
5065         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5066         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5067     }
5068     zExp = aExp + bExp - 0x3FFE;
5069     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5070     if ( 0 < (int64_t) zSig0 ) {
5071         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5072         --zExp;
5073     }
5074     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5075                                 zSign, zExp, zSig0, zSig1, status);
5076 }
5077
5078 /*----------------------------------------------------------------------------
5079 | Returns the result of dividing the extended double-precision floating-point
5080 | value `a' by the corresponding value `b'.  The operation is performed
5081 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5082 *----------------------------------------------------------------------------*/
5083
5084 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5085 {
5086     flag aSign, bSign, zSign;
5087     int32_t aExp, bExp, zExp;
5088     uint64_t aSig, bSig, zSig0, zSig1;
5089     uint64_t rem0, rem1, rem2, term0, term1, term2;
5090
5091     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5092         float_raise(float_flag_invalid, status);
5093         return floatx80_default_nan(status);
5094     }
5095     aSig = extractFloatx80Frac( a );
5096     aExp = extractFloatx80Exp( a );
5097     aSign = extractFloatx80Sign( a );
5098     bSig = extractFloatx80Frac( b );
5099     bExp = extractFloatx80Exp( b );
5100     bSign = extractFloatx80Sign( b );
5101     zSign = aSign ^ bSign;
5102     if ( aExp == 0x7FFF ) {
5103         if ((uint64_t)(aSig << 1)) {
5104             return propagateFloatx80NaN(a, b, status);
5105         }
5106         if ( bExp == 0x7FFF ) {
5107             if ((uint64_t)(bSig << 1)) {
5108                 return propagateFloatx80NaN(a, b, status);
5109             }
5110             goto invalid;
5111         }
5112         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5113     }
5114     if ( bExp == 0x7FFF ) {
5115         if ((uint64_t)(bSig << 1)) {
5116             return propagateFloatx80NaN(a, b, status);
5117         }
5118         return packFloatx80( zSign, 0, 0 );
5119     }
5120     if ( bExp == 0 ) {
5121         if ( bSig == 0 ) {
5122             if ( ( aExp | aSig ) == 0 ) {
5123  invalid:
5124                 float_raise(float_flag_invalid, status);
5125                 return floatx80_default_nan(status);
5126             }
5127             float_raise(float_flag_divbyzero, status);
5128             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5129         }
5130         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5131     }
5132     if ( aExp == 0 ) {
5133         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5134         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5135     }
5136     zExp = aExp - bExp + 0x3FFE;
5137     rem1 = 0;
5138     if ( bSig <= aSig ) {
5139         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5140         ++zExp;
5141     }
5142     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5143     mul64To128( bSig, zSig0, &term0, &term1 );
5144     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5145     while ( (int64_t) rem0 < 0 ) {
5146         --zSig0;
5147         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5148     }
5149     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5150     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5151         mul64To128( bSig, zSig1, &term1, &term2 );
5152         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5153         while ( (int64_t) rem1 < 0 ) {
5154             --zSig1;
5155             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5156         }
5157         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5158     }
5159     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5160                                 zSign, zExp, zSig0, zSig1, status);
5161 }
5162
5163 /*----------------------------------------------------------------------------
5164 | Returns the remainder of the extended double-precision floating-point value
5165 | `a' with respect to the corresponding value `b'.  The operation is performed
5166 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5167 *----------------------------------------------------------------------------*/
5168
5169 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5170 {
5171     flag aSign, zSign;
5172     int32_t aExp, bExp, expDiff;
5173     uint64_t aSig0, aSig1, bSig;
5174     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5175
5176     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5177         float_raise(float_flag_invalid, status);
5178         return floatx80_default_nan(status);
5179     }
5180     aSig0 = extractFloatx80Frac( a );
5181     aExp = extractFloatx80Exp( a );
5182     aSign = extractFloatx80Sign( a );
5183     bSig = extractFloatx80Frac( b );
5184     bExp = extractFloatx80Exp( b );
5185     if ( aExp == 0x7FFF ) {
5186         if (    (uint64_t) ( aSig0<<1 )
5187              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5188             return propagateFloatx80NaN(a, b, status);
5189         }
5190         goto invalid;
5191     }
5192     if ( bExp == 0x7FFF ) {
5193         if ((uint64_t)(bSig << 1)) {
5194             return propagateFloatx80NaN(a, b, status);
5195         }
5196         return a;
5197     }
5198     if ( bExp == 0 ) {
5199         if ( bSig == 0 ) {
5200  invalid:
5201             float_raise(float_flag_invalid, status);
5202             return floatx80_default_nan(status);
5203         }
5204         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5205     }
5206     if ( aExp == 0 ) {
5207         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5208         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5209     }
5210     bSig |= LIT64( 0x8000000000000000 );
5211     zSign = aSign;
5212     expDiff = aExp - bExp;
5213     aSig1 = 0;
5214     if ( expDiff < 0 ) {
5215         if ( expDiff < -1 ) return a;
5216         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5217         expDiff = 0;
5218     }
5219     q = ( bSig <= aSig0 );
5220     if ( q ) aSig0 -= bSig;
5221     expDiff -= 64;
5222     while ( 0 < expDiff ) {
5223         q = estimateDiv128To64( aSig0, aSig1, bSig );
5224         q = ( 2 < q ) ? q - 2 : 0;
5225         mul64To128( bSig, q, &term0, &term1 );
5226         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5227         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5228         expDiff -= 62;
5229     }
5230     expDiff += 64;
5231     if ( 0 < expDiff ) {
5232         q = estimateDiv128To64( aSig0, aSig1, bSig );
5233         q = ( 2 < q ) ? q - 2 : 0;
5234         q >>= 64 - expDiff;
5235         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5236         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5237         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5238         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5239             ++q;
5240             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5241         }
5242     }
5243     else {
5244         term1 = 0;
5245         term0 = bSig;
5246     }
5247     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5248     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5249          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5250               && ( q & 1 ) )
5251        ) {
5252         aSig0 = alternateASig0;
5253         aSig1 = alternateASig1;
5254         zSign = ! zSign;
5255     }
5256     return
5257         normalizeRoundAndPackFloatx80(
5258             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5259
5260 }
5261
5262 /*----------------------------------------------------------------------------
5263 | Returns the square root of the extended double-precision floating-point
5264 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5265 | for Binary Floating-Point Arithmetic.
5266 *----------------------------------------------------------------------------*/
5267
5268 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5269 {
5270     flag aSign;
5271     int32_t aExp, zExp;
5272     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5273     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5274
5275     if (floatx80_invalid_encoding(a)) {
5276         float_raise(float_flag_invalid, status);
5277         return floatx80_default_nan(status);
5278     }
5279     aSig0 = extractFloatx80Frac( a );
5280     aExp = extractFloatx80Exp( a );
5281     aSign = extractFloatx80Sign( a );
5282     if ( aExp == 0x7FFF ) {
5283         if ((uint64_t)(aSig0 << 1)) {
5284             return propagateFloatx80NaN(a, a, status);
5285         }
5286         if ( ! aSign ) return a;
5287         goto invalid;
5288     }
5289     if ( aSign ) {
5290         if ( ( aExp | aSig0 ) == 0 ) return a;
5291  invalid:
5292         float_raise(float_flag_invalid, status);
5293         return floatx80_default_nan(status);
5294     }
5295     if ( aExp == 0 ) {
5296         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5297         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5298     }
5299     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5300     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5301     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5302     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5303     doubleZSig0 = zSig0<<1;
5304     mul64To128( zSig0, zSig0, &term0, &term1 );
5305     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5306     while ( (int64_t) rem0 < 0 ) {
5307         --zSig0;
5308         doubleZSig0 -= 2;
5309         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5310     }
5311     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5312     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5313         if ( zSig1 == 0 ) zSig1 = 1;
5314         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5315         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5316         mul64To128( zSig1, zSig1, &term2, &term3 );
5317         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5318         while ( (int64_t) rem1 < 0 ) {
5319             --zSig1;
5320             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5321             term3 |= 1;
5322             term2 |= doubleZSig0;
5323             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5324         }
5325         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5326     }
5327     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5328     zSig0 |= doubleZSig0;
5329     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5330                                 0, zExp, zSig0, zSig1, status);
5331 }
5332
5333 /*----------------------------------------------------------------------------
5334 | Returns 1 if the extended double-precision floating-point value `a' is equal
5335 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5336 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5338 *----------------------------------------------------------------------------*/
5339
5340 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5341 {
5342
5343     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5344         || (extractFloatx80Exp(a) == 0x7FFF
5345             && (uint64_t) (extractFloatx80Frac(a) << 1))
5346         || (extractFloatx80Exp(b) == 0x7FFF
5347             && (uint64_t) (extractFloatx80Frac(b) << 1))
5348        ) {
5349         float_raise(float_flag_invalid, status);
5350         return 0;
5351     }
5352     return
5353            ( a.low == b.low )
5354         && (    ( a.high == b.high )
5355              || (    ( a.low == 0 )
5356                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5357            );
5358
5359 }
5360
5361 /*----------------------------------------------------------------------------
5362 | Returns 1 if the extended double-precision floating-point value `a' is
5363 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5364 | invalid exception is raised if either operand is a NaN.  The comparison is
5365 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5366 | Arithmetic.
5367 *----------------------------------------------------------------------------*/
5368
5369 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5370 {
5371     flag aSign, bSign;
5372
5373     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5374         || (extractFloatx80Exp(a) == 0x7FFF
5375             && (uint64_t) (extractFloatx80Frac(a) << 1))
5376         || (extractFloatx80Exp(b) == 0x7FFF
5377             && (uint64_t) (extractFloatx80Frac(b) << 1))
5378        ) {
5379         float_raise(float_flag_invalid, status);
5380         return 0;
5381     }
5382     aSign = extractFloatx80Sign( a );
5383     bSign = extractFloatx80Sign( b );
5384     if ( aSign != bSign ) {
5385         return
5386                aSign
5387             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5388                  == 0 );
5389     }
5390     return
5391           aSign ? le128( b.high, b.low, a.high, a.low )
5392         : le128( a.high, a.low, b.high, b.low );
5393
5394 }
5395
5396 /*----------------------------------------------------------------------------
5397 | Returns 1 if the extended double-precision floating-point value `a' is
5398 | less than the corresponding value `b', and 0 otherwise.  The invalid
5399 | exception is raised if either operand is a NaN.  The comparison is performed
5400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5401 *----------------------------------------------------------------------------*/
5402
5403 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5404 {
5405     flag aSign, bSign;
5406
5407     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5408         || (extractFloatx80Exp(a) == 0x7FFF
5409             && (uint64_t) (extractFloatx80Frac(a) << 1))
5410         || (extractFloatx80Exp(b) == 0x7FFF
5411             && (uint64_t) (extractFloatx80Frac(b) << 1))
5412        ) {
5413         float_raise(float_flag_invalid, status);
5414         return 0;
5415     }
5416     aSign = extractFloatx80Sign( a );
5417     bSign = extractFloatx80Sign( b );
5418     if ( aSign != bSign ) {
5419         return
5420                aSign
5421             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5422                  != 0 );
5423     }
5424     return
5425           aSign ? lt128( b.high, b.low, a.high, a.low )
5426         : lt128( a.high, a.low, b.high, b.low );
5427
5428 }
5429
5430 /*----------------------------------------------------------------------------
5431 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5432 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5433 | either operand is a NaN.   The comparison is performed according to the
5434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5435 *----------------------------------------------------------------------------*/
5436 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5437 {
5438     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5439         || (extractFloatx80Exp(a) == 0x7FFF
5440             && (uint64_t) (extractFloatx80Frac(a) << 1))
5441         || (extractFloatx80Exp(b) == 0x7FFF
5442             && (uint64_t) (extractFloatx80Frac(b) << 1))
5443        ) {
5444         float_raise(float_flag_invalid, status);
5445         return 1;
5446     }
5447     return 0;
5448 }
5449
5450 /*----------------------------------------------------------------------------
5451 | Returns 1 if the extended double-precision floating-point value `a' is
5452 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5453 | cause an exception.  The comparison is performed according to the IEC/IEEE
5454 | Standard for Binary Floating-Point Arithmetic.
5455 *----------------------------------------------------------------------------*/
5456
5457 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5458 {
5459
5460     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5461         float_raise(float_flag_invalid, status);
5462         return 0;
5463     }
5464     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5465               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5466          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5467               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5468        ) {
5469         if (floatx80_is_signaling_nan(a, status)
5470          || floatx80_is_signaling_nan(b, status)) {
5471             float_raise(float_flag_invalid, status);
5472         }
5473         return 0;
5474     }
5475     return
5476            ( a.low == b.low )
5477         && (    ( a.high == b.high )
5478              || (    ( a.low == 0 )
5479                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5480            );
5481
5482 }
5483
5484 /*----------------------------------------------------------------------------
5485 | Returns 1 if the extended double-precision floating-point value `a' is less
5486 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5487 | do not cause an exception.  Otherwise, the comparison is performed according
5488 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5489 *----------------------------------------------------------------------------*/
5490
5491 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5492 {
5493     flag aSign, bSign;
5494
5495     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5496         float_raise(float_flag_invalid, status);
5497         return 0;
5498     }
5499     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5500               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5501          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5502               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5503        ) {
5504         if (floatx80_is_signaling_nan(a, status)
5505          || floatx80_is_signaling_nan(b, status)) {
5506             float_raise(float_flag_invalid, status);
5507         }
5508         return 0;
5509     }
5510     aSign = extractFloatx80Sign( a );
5511     bSign = extractFloatx80Sign( b );
5512     if ( aSign != bSign ) {
5513         return
5514                aSign
5515             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5516                  == 0 );
5517     }
5518     return
5519           aSign ? le128( b.high, b.low, a.high, a.low )
5520         : le128( a.high, a.low, b.high, b.low );
5521
5522 }
5523
5524 /*----------------------------------------------------------------------------
5525 | Returns 1 if the extended double-precision floating-point value `a' is less
5526 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5527 | an exception.  Otherwise, the comparison is performed according to the
5528 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5529 *----------------------------------------------------------------------------*/
5530
5531 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5532 {
5533     flag aSign, bSign;
5534
5535     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5536         float_raise(float_flag_invalid, status);
5537         return 0;
5538     }
5539     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5540               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5541          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5542               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5543        ) {
5544         if (floatx80_is_signaling_nan(a, status)
5545          || floatx80_is_signaling_nan(b, status)) {
5546             float_raise(float_flag_invalid, status);
5547         }
5548         return 0;
5549     }
5550     aSign = extractFloatx80Sign( a );
5551     bSign = extractFloatx80Sign( b );
5552     if ( aSign != bSign ) {
5553         return
5554                aSign
5555             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5556                  != 0 );
5557     }
5558     return
5559           aSign ? lt128( b.high, b.low, a.high, a.low )
5560         : lt128( a.high, a.low, b.high, b.low );
5561
5562 }
5563
5564 /*----------------------------------------------------------------------------
5565 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5566 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5567 | The comparison is performed according to the IEC/IEEE Standard for Binary
5568 | Floating-Point Arithmetic.
5569 *----------------------------------------------------------------------------*/
5570 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5571 {
5572     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5573         float_raise(float_flag_invalid, status);
5574         return 1;
5575     }
5576     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5577               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5578          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5579               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5580        ) {
5581         if (floatx80_is_signaling_nan(a, status)
5582          || floatx80_is_signaling_nan(b, status)) {
5583             float_raise(float_flag_invalid, status);
5584         }
5585         return 1;
5586     }
5587     return 0;
5588 }
5589
5590 /*----------------------------------------------------------------------------
5591 | Returns the result of converting the quadruple-precision floating-point
5592 | value `a' to the 32-bit two's complement integer format.  The conversion
5593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5594 | Arithmetic---which means in particular that the conversion is rounded
5595 | according to the current rounding mode.  If `a' is a NaN, the largest
5596 | positive integer is returned.  Otherwise, if the conversion overflows, the
5597 | largest integer with the same sign as `a' is returned.
5598 *----------------------------------------------------------------------------*/
5599
5600 int32_t float128_to_int32(float128 a, float_status *status)
5601 {
5602     flag aSign;
5603     int32_t aExp, shiftCount;
5604     uint64_t aSig0, aSig1;
5605
5606     aSig1 = extractFloat128Frac1( a );
5607     aSig0 = extractFloat128Frac0( a );
5608     aExp = extractFloat128Exp( a );
5609     aSign = extractFloat128Sign( a );
5610     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5611     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5612     aSig0 |= ( aSig1 != 0 );
5613     shiftCount = 0x4028 - aExp;
5614     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5615     return roundAndPackInt32(aSign, aSig0, status);
5616
5617 }
5618
5619 /*----------------------------------------------------------------------------
5620 | Returns the result of converting the quadruple-precision floating-point
5621 | value `a' to the 32-bit two's complement integer format.  The conversion
5622 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5623 | Arithmetic, except that the conversion is always rounded toward zero.  If
5624 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5625 | conversion overflows, the largest integer with the same sign as `a' is
5626 | returned.
5627 *----------------------------------------------------------------------------*/
5628
5629 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5630 {
5631     flag aSign;
5632     int32_t aExp, shiftCount;
5633     uint64_t aSig0, aSig1, savedASig;
5634     int32_t z;
5635
5636     aSig1 = extractFloat128Frac1( a );
5637     aSig0 = extractFloat128Frac0( a );
5638     aExp = extractFloat128Exp( a );
5639     aSign = extractFloat128Sign( a );
5640     aSig0 |= ( aSig1 != 0 );
5641     if ( 0x401E < aExp ) {
5642         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5643         goto invalid;
5644     }
5645     else if ( aExp < 0x3FFF ) {
5646         if (aExp || aSig0) {
5647             status->float_exception_flags |= float_flag_inexact;
5648         }
5649         return 0;
5650     }
5651     aSig0 |= LIT64( 0x0001000000000000 );
5652     shiftCount = 0x402F - aExp;
5653     savedASig = aSig0;
5654     aSig0 >>= shiftCount;
5655     z = aSig0;
5656     if ( aSign ) z = - z;
5657     if ( ( z < 0 ) ^ aSign ) {
5658  invalid:
5659         float_raise(float_flag_invalid, status);
5660         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5661     }
5662     if ( ( aSig0<<shiftCount ) != savedASig ) {
5663         status->float_exception_flags |= float_flag_inexact;
5664     }
5665     return z;
5666
5667 }
5668
5669 /*----------------------------------------------------------------------------
5670 | Returns the result of converting the quadruple-precision floating-point
5671 | value `a' to the 64-bit two's complement integer format.  The conversion
5672 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5673 | Arithmetic---which means in particular that the conversion is rounded
5674 | according to the current rounding mode.  If `a' is a NaN, the largest
5675 | positive integer is returned.  Otherwise, if the conversion overflows, the
5676 | largest integer with the same sign as `a' is returned.
5677 *----------------------------------------------------------------------------*/
5678
5679 int64_t float128_to_int64(float128 a, float_status *status)
5680 {
5681     flag aSign;
5682     int32_t aExp, shiftCount;
5683     uint64_t aSig0, aSig1;
5684
5685     aSig1 = extractFloat128Frac1( a );
5686     aSig0 = extractFloat128Frac0( a );
5687     aExp = extractFloat128Exp( a );
5688     aSign = extractFloat128Sign( a );
5689     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5690     shiftCount = 0x402F - aExp;
5691     if ( shiftCount <= 0 ) {
5692         if ( 0x403E < aExp ) {
5693             float_raise(float_flag_invalid, status);
5694             if (    ! aSign
5695                  || (    ( aExp == 0x7FFF )
5696                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5697                     )
5698                ) {
5699                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5700             }
5701             return (int64_t) LIT64( 0x8000000000000000 );
5702         }
5703         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5704     }
5705     else {
5706         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5707     }
5708     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5709
5710 }
5711
5712 /*----------------------------------------------------------------------------
5713 | Returns the result of converting the quadruple-precision floating-point
5714 | value `a' to the 64-bit two's complement integer format.  The conversion
5715 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5716 | Arithmetic, except that the conversion is always rounded toward zero.
5717 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5718 | the conversion overflows, the largest integer with the same sign as `a' is
5719 | returned.
5720 *----------------------------------------------------------------------------*/
5721
5722 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5723 {
5724     flag aSign;
5725     int32_t aExp, shiftCount;
5726     uint64_t aSig0, aSig1;
5727     int64_t z;
5728
5729     aSig1 = extractFloat128Frac1( a );
5730     aSig0 = extractFloat128Frac0( a );
5731     aExp = extractFloat128Exp( a );
5732     aSign = extractFloat128Sign( a );
5733     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5734     shiftCount = aExp - 0x402F;
5735     if ( 0 < shiftCount ) {
5736         if ( 0x403E <= aExp ) {
5737             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5738             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5739                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5740                 if (aSig1) {
5741                     status->float_exception_flags |= float_flag_inexact;
5742                 }
5743             }
5744             else {
5745                 float_raise(float_flag_invalid, status);
5746                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5747                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5748                 }
5749             }
5750             return (int64_t) LIT64( 0x8000000000000000 );
5751         }
5752         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5753         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5754             status->float_exception_flags |= float_flag_inexact;
5755         }
5756     }
5757     else {
5758         if ( aExp < 0x3FFF ) {
5759             if ( aExp | aSig0 | aSig1 ) {
5760                 status->float_exception_flags |= float_flag_inexact;
5761             }
5762             return 0;
5763         }
5764         z = aSig0>>( - shiftCount );
5765         if (    aSig1
5766              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5767             status->float_exception_flags |= float_flag_inexact;
5768         }
5769     }
5770     if ( aSign ) z = - z;
5771     return z;
5772
5773 }
5774
5775 /*----------------------------------------------------------------------------
5776 | Returns the result of converting the quadruple-precision floating-point value
5777 | `a' to the 64-bit unsigned integer format.  The conversion is
5778 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5779 | Arithmetic---which means in particular that the conversion is rounded
5780 | according to the current rounding mode.  If `a' is a NaN, the largest
5781 | positive integer is returned.  If the conversion overflows, the
5782 | largest unsigned integer is returned.  If 'a' is negative, the value is
5783 | rounded and zero is returned; negative values that do not round to zero
5784 | will raise the inexact exception.
5785 *----------------------------------------------------------------------------*/
5786
5787 uint64_t float128_to_uint64(float128 a, float_status *status)
5788 {
5789     flag aSign;
5790     int aExp;
5791     int shiftCount;
5792     uint64_t aSig0, aSig1;
5793
5794     aSig0 = extractFloat128Frac0(a);
5795     aSig1 = extractFloat128Frac1(a);
5796     aExp = extractFloat128Exp(a);
5797     aSign = extractFloat128Sign(a);
5798     if (aSign && (aExp > 0x3FFE)) {
5799         float_raise(float_flag_invalid, status);
5800         if (float128_is_any_nan(a)) {
5801             return LIT64(0xFFFFFFFFFFFFFFFF);
5802         } else {
5803             return 0;
5804         }
5805     }
5806     if (aExp) {
5807         aSig0 |= LIT64(0x0001000000000000);
5808     }
5809     shiftCount = 0x402F - aExp;
5810     if (shiftCount <= 0) {
5811         if (0x403E < aExp) {
5812             float_raise(float_flag_invalid, status);
5813             return LIT64(0xFFFFFFFFFFFFFFFF);
5814         }
5815         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5816     } else {
5817         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5818     }
5819     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5820 }
5821
5822 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5823 {
5824     uint64_t v;
5825     signed char current_rounding_mode = status->float_rounding_mode;
5826
5827     set_float_rounding_mode(float_round_to_zero, status);
5828     v = float128_to_uint64(a, status);
5829     set_float_rounding_mode(current_rounding_mode, status);
5830
5831     return v;
5832 }
5833
5834 /*----------------------------------------------------------------------------
5835 | Returns the result of converting the quadruple-precision floating-point
5836 | value `a' to the 32-bit unsigned integer format.  The conversion
5837 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5838 | Arithmetic except that the conversion is always rounded toward zero.
5839 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5840 | if the conversion overflows, the largest unsigned integer is returned.
5841 | If 'a' is negative, the value is rounded and zero is returned; negative
5842 | values that do not round to zero will raise the inexact exception.
5843 *----------------------------------------------------------------------------*/
5844
5845 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5846 {
5847     uint64_t v;
5848     uint32_t res;
5849     int old_exc_flags = get_float_exception_flags(status);
5850
5851     v = float128_to_uint64_round_to_zero(a, status);
5852     if (v > 0xffffffff) {
5853         res = 0xffffffff;
5854     } else {
5855         return v;
5856     }
5857     set_float_exception_flags(old_exc_flags, status);
5858     float_raise(float_flag_invalid, status);
5859     return res;
5860 }
5861
5862 /*----------------------------------------------------------------------------
5863 | Returns the result of converting the quadruple-precision floating-point
5864 | value `a' to the single-precision floating-point format.  The conversion
5865 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5866 | Arithmetic.
5867 *----------------------------------------------------------------------------*/
5868
5869 float32 float128_to_float32(float128 a, float_status *status)
5870 {
5871     flag aSign;
5872     int32_t aExp;
5873     uint64_t aSig0, aSig1;
5874     uint32_t zSig;
5875
5876     aSig1 = extractFloat128Frac1( a );
5877     aSig0 = extractFloat128Frac0( a );
5878     aExp = extractFloat128Exp( a );
5879     aSign = extractFloat128Sign( a );
5880     if ( aExp == 0x7FFF ) {
5881         if ( aSig0 | aSig1 ) {
5882             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5883         }
5884         return packFloat32( aSign, 0xFF, 0 );
5885     }
5886     aSig0 |= ( aSig1 != 0 );
5887     shift64RightJamming( aSig0, 18, &aSig0 );
5888     zSig = aSig0;
5889     if ( aExp || zSig ) {
5890         zSig |= 0x40000000;
5891         aExp -= 0x3F81;
5892     }
5893     return roundAndPackFloat32(aSign, aExp, zSig, status);
5894
5895 }
5896
5897 /*----------------------------------------------------------------------------
5898 | Returns the result of converting the quadruple-precision floating-point
5899 | value `a' to the double-precision floating-point format.  The conversion
5900 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5901 | Arithmetic.
5902 *----------------------------------------------------------------------------*/
5903
5904 float64 float128_to_float64(float128 a, float_status *status)
5905 {
5906     flag aSign;
5907     int32_t aExp;
5908     uint64_t aSig0, aSig1;
5909
5910     aSig1 = extractFloat128Frac1( a );
5911     aSig0 = extractFloat128Frac0( a );
5912     aExp = extractFloat128Exp( a );
5913     aSign = extractFloat128Sign( a );
5914     if ( aExp == 0x7FFF ) {
5915         if ( aSig0 | aSig1 ) {
5916             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5917         }
5918         return packFloat64( aSign, 0x7FF, 0 );
5919     }
5920     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5921     aSig0 |= ( aSig1 != 0 );
5922     if ( aExp || aSig0 ) {
5923         aSig0 |= LIT64( 0x4000000000000000 );
5924         aExp -= 0x3C01;
5925     }
5926     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5927
5928 }
5929
5930 /*----------------------------------------------------------------------------
5931 | Returns the result of converting the quadruple-precision floating-point
5932 | value `a' to the extended double-precision floating-point format.  The
5933 | conversion is performed according to the IEC/IEEE Standard for Binary
5934 | Floating-Point Arithmetic.
5935 *----------------------------------------------------------------------------*/
5936
5937 floatx80 float128_to_floatx80(float128 a, float_status *status)
5938 {
5939     flag aSign;
5940     int32_t aExp;
5941     uint64_t aSig0, aSig1;
5942
5943     aSig1 = extractFloat128Frac1( a );
5944     aSig0 = extractFloat128Frac0( a );
5945     aExp = extractFloat128Exp( a );
5946     aSign = extractFloat128Sign( a );
5947     if ( aExp == 0x7FFF ) {
5948         if ( aSig0 | aSig1 ) {
5949             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5950         }
5951         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5952     }
5953     if ( aExp == 0 ) {
5954         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5955         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5956     }
5957     else {
5958         aSig0 |= LIT64( 0x0001000000000000 );
5959     }
5960     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5961     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5962
5963 }
5964
5965 /*----------------------------------------------------------------------------
5966 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5967 | returns the result as a quadruple-precision floating-point value.  The
5968 | operation is performed according to the IEC/IEEE Standard for Binary
5969 | Floating-Point Arithmetic.
5970 *----------------------------------------------------------------------------*/
5971
5972 float128 float128_round_to_int(float128 a, float_status *status)
5973 {
5974     flag aSign;
5975     int32_t aExp;
5976     uint64_t lastBitMask, roundBitsMask;
5977     float128 z;
5978
5979     aExp = extractFloat128Exp( a );
5980     if ( 0x402F <= aExp ) {
5981         if ( 0x406F <= aExp ) {
5982             if (    ( aExp == 0x7FFF )
5983                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5984                ) {
5985                 return propagateFloat128NaN(a, a, status);
5986             }
5987             return a;
5988         }
5989         lastBitMask = 1;
5990         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5991         roundBitsMask = lastBitMask - 1;
5992         z = a;
5993         switch (status->float_rounding_mode) {
5994         case float_round_nearest_even:
5995             if ( lastBitMask ) {
5996                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5997                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5998             }
5999             else {
6000                 if ( (int64_t) z.low < 0 ) {
6001                     ++z.high;
6002                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6003                 }
6004             }
6005             break;
6006         case float_round_ties_away:
6007             if (lastBitMask) {
6008                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6009             } else {
6010                 if ((int64_t) z.low < 0) {
6011                     ++z.high;
6012                 }
6013             }
6014             break;
6015         case float_round_to_zero:
6016             break;
6017         case float_round_up:
6018             if (!extractFloat128Sign(z)) {
6019                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6020             }
6021             break;
6022         case float_round_down:
6023             if (extractFloat128Sign(z)) {
6024                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6025             }
6026             break;
6027         default:
6028             abort();
6029         }
6030         z.low &= ~ roundBitsMask;
6031     }
6032     else {
6033         if ( aExp < 0x3FFF ) {
6034             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6035             status->float_exception_flags |= float_flag_inexact;
6036             aSign = extractFloat128Sign( a );
6037             switch (status->float_rounding_mode) {
6038              case float_round_nearest_even:
6039                 if (    ( aExp == 0x3FFE )
6040                      && (   extractFloat128Frac0( a )
6041                           | extractFloat128Frac1( a ) )
6042                    ) {
6043                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6044                 }
6045                 break;
6046             case float_round_ties_away:
6047                 if (aExp == 0x3FFE) {
6048                     return packFloat128(aSign, 0x3FFF, 0, 0);
6049                 }
6050                 break;
6051              case float_round_down:
6052                 return
6053                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6054                     : packFloat128( 0, 0, 0, 0 );
6055              case float_round_up:
6056                 return
6057                       aSign ? packFloat128( 1, 0, 0, 0 )
6058                     : packFloat128( 0, 0x3FFF, 0, 0 );
6059             }
6060             return packFloat128( aSign, 0, 0, 0 );
6061         }
6062         lastBitMask = 1;
6063         lastBitMask <<= 0x402F - aExp;
6064         roundBitsMask = lastBitMask - 1;
6065         z.low = 0;
6066         z.high = a.high;
6067         switch (status->float_rounding_mode) {
6068         case float_round_nearest_even:
6069             z.high += lastBitMask>>1;
6070             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6071                 z.high &= ~ lastBitMask;
6072             }
6073             break;
6074         case float_round_ties_away:
6075             z.high += lastBitMask>>1;
6076             break;
6077         case float_round_to_zero:
6078             break;
6079         case float_round_up:
6080             if (!extractFloat128Sign(z)) {
6081                 z.high |= ( a.low != 0 );
6082                 z.high += roundBitsMask;
6083             }
6084             break;
6085         case float_round_down:
6086             if (extractFloat128Sign(z)) {
6087                 z.high |= (a.low != 0);
6088                 z.high += roundBitsMask;
6089             }
6090             break;
6091         default:
6092             abort();
6093         }
6094         z.high &= ~ roundBitsMask;
6095     }
6096     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6097         status->float_exception_flags |= float_flag_inexact;
6098     }
6099     return z;
6100
6101 }
6102
6103 /*----------------------------------------------------------------------------
6104 | Returns the result of adding the absolute values of the quadruple-precision
6105 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6106 | before being returned.  `zSign' is ignored if the result is a NaN.
6107 | The addition is performed according to the IEC/IEEE Standard for Binary
6108 | Floating-Point Arithmetic.
6109 *----------------------------------------------------------------------------*/
6110
6111 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6112                                 float_status *status)
6113 {
6114     int32_t aExp, bExp, zExp;
6115     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6116     int32_t expDiff;
6117
6118     aSig1 = extractFloat128Frac1( a );
6119     aSig0 = extractFloat128Frac0( a );
6120     aExp = extractFloat128Exp( a );
6121     bSig1 = extractFloat128Frac1( b );
6122     bSig0 = extractFloat128Frac0( b );
6123     bExp = extractFloat128Exp( b );
6124     expDiff = aExp - bExp;
6125     if ( 0 < expDiff ) {
6126         if ( aExp == 0x7FFF ) {
6127             if (aSig0 | aSig1) {
6128                 return propagateFloat128NaN(a, b, status);
6129             }
6130             return a;
6131         }
6132         if ( bExp == 0 ) {
6133             --expDiff;
6134         }
6135         else {
6136             bSig0 |= LIT64( 0x0001000000000000 );
6137         }
6138         shift128ExtraRightJamming(
6139             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6140         zExp = aExp;
6141     }
6142     else if ( expDiff < 0 ) {
6143         if ( bExp == 0x7FFF ) {
6144             if (bSig0 | bSig1) {
6145                 return propagateFloat128NaN(a, b, status);
6146             }
6147             return packFloat128( zSign, 0x7FFF, 0, 0 );
6148         }
6149         if ( aExp == 0 ) {
6150             ++expDiff;
6151         }
6152         else {
6153             aSig0 |= LIT64( 0x0001000000000000 );
6154         }
6155         shift128ExtraRightJamming(
6156             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6157         zExp = bExp;
6158     }
6159     else {
6160         if ( aExp == 0x7FFF ) {
6161             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6162                 return propagateFloat128NaN(a, b, status);
6163             }
6164             return a;
6165         }
6166         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6167         if ( aExp == 0 ) {
6168             if (status->flush_to_zero) {
6169                 if (zSig0 | zSig1) {
6170                     float_raise(float_flag_output_denormal, status);
6171                 }
6172                 return packFloat128(zSign, 0, 0, 0);
6173             }
6174             return packFloat128( zSign, 0, zSig0, zSig1 );
6175         }
6176         zSig2 = 0;
6177         zSig0 |= LIT64( 0x0002000000000000 );
6178         zExp = aExp;
6179         goto shiftRight1;
6180     }
6181     aSig0 |= LIT64( 0x0001000000000000 );
6182     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6183     --zExp;
6184     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6185     ++zExp;
6186  shiftRight1:
6187     shift128ExtraRightJamming(
6188         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6189  roundAndPack:
6190     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6191
6192 }
6193
6194 /*----------------------------------------------------------------------------
6195 | Returns the result of subtracting the absolute values of the quadruple-
6196 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6197 | difference is negated before being returned.  `zSign' is ignored if the
6198 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6199 | Standard for Binary Floating-Point Arithmetic.
6200 *----------------------------------------------------------------------------*/
6201
6202 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6203                                 float_status *status)
6204 {
6205     int32_t aExp, bExp, zExp;
6206     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6207     int32_t expDiff;
6208
6209     aSig1 = extractFloat128Frac1( a );
6210     aSig0 = extractFloat128Frac0( a );
6211     aExp = extractFloat128Exp( a );
6212     bSig1 = extractFloat128Frac1( b );
6213     bSig0 = extractFloat128Frac0( b );
6214     bExp = extractFloat128Exp( b );
6215     expDiff = aExp - bExp;
6216     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6217     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6218     if ( 0 < expDiff ) goto aExpBigger;
6219     if ( expDiff < 0 ) goto bExpBigger;
6220     if ( aExp == 0x7FFF ) {
6221         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6222             return propagateFloat128NaN(a, b, status);
6223         }
6224         float_raise(float_flag_invalid, status);
6225         return float128_default_nan(status);
6226     }
6227     if ( aExp == 0 ) {
6228         aExp = 1;
6229         bExp = 1;
6230     }
6231     if ( bSig0 < aSig0 ) goto aBigger;
6232     if ( aSig0 < bSig0 ) goto bBigger;
6233     if ( bSig1 < aSig1 ) goto aBigger;
6234     if ( aSig1 < bSig1 ) goto bBigger;
6235     return packFloat128(status->float_rounding_mode == float_round_down,
6236                         0, 0, 0);
6237  bExpBigger:
6238     if ( bExp == 0x7FFF ) {
6239         if (bSig0 | bSig1) {
6240             return propagateFloat128NaN(a, b, status);
6241         }
6242         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6243     }
6244     if ( aExp == 0 ) {
6245         ++expDiff;
6246     }
6247     else {
6248         aSig0 |= LIT64( 0x4000000000000000 );
6249     }
6250     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6251     bSig0 |= LIT64( 0x4000000000000000 );
6252  bBigger:
6253     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6254     zExp = bExp;
6255     zSign ^= 1;
6256     goto normalizeRoundAndPack;
6257  aExpBigger:
6258     if ( aExp == 0x7FFF ) {
6259         if (aSig0 | aSig1) {
6260             return propagateFloat128NaN(a, b, status);
6261         }
6262         return a;
6263     }
6264     if ( bExp == 0 ) {
6265         --expDiff;
6266     }
6267     else {
6268         bSig0 |= LIT64( 0x4000000000000000 );
6269     }
6270     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6271     aSig0 |= LIT64( 0x4000000000000000 );
6272  aBigger:
6273     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6274     zExp = aExp;
6275  normalizeRoundAndPack:
6276     --zExp;
6277     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6278                                          status);
6279
6280 }
6281
6282 /*----------------------------------------------------------------------------
6283 | Returns the result of adding the quadruple-precision floating-point values
6284 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6285 | for Binary Floating-Point Arithmetic.
6286 *----------------------------------------------------------------------------*/
6287
6288 float128 float128_add(float128 a, float128 b, float_status *status)
6289 {
6290     flag aSign, bSign;
6291
6292     aSign = extractFloat128Sign( a );
6293     bSign = extractFloat128Sign( b );
6294     if ( aSign == bSign ) {
6295         return addFloat128Sigs(a, b, aSign, status);
6296     }
6297     else {
6298         return subFloat128Sigs(a, b, aSign, status);
6299     }
6300
6301 }
6302
6303 /*----------------------------------------------------------------------------
6304 | Returns the result of subtracting the quadruple-precision floating-point
6305 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6306 | Standard for Binary Floating-Point Arithmetic.
6307 *----------------------------------------------------------------------------*/
6308
6309 float128 float128_sub(float128 a, float128 b, float_status *status)
6310 {
6311     flag aSign, bSign;
6312
6313     aSign = extractFloat128Sign( a );
6314     bSign = extractFloat128Sign( b );
6315     if ( aSign == bSign ) {
6316         return subFloat128Sigs(a, b, aSign, status);
6317     }
6318     else {
6319         return addFloat128Sigs(a, b, aSign, status);
6320     }
6321
6322 }
6323
6324 /*----------------------------------------------------------------------------
6325 | Returns the result of multiplying the quadruple-precision floating-point
6326 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6327 | Standard for Binary Floating-Point Arithmetic.
6328 *----------------------------------------------------------------------------*/
6329
6330 float128 float128_mul(float128 a, float128 b, float_status *status)
6331 {
6332     flag aSign, bSign, zSign;
6333     int32_t aExp, bExp, zExp;
6334     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6335
6336     aSig1 = extractFloat128Frac1( a );
6337     aSig0 = extractFloat128Frac0( a );
6338     aExp = extractFloat128Exp( a );
6339     aSign = extractFloat128Sign( a );
6340     bSig1 = extractFloat128Frac1( b );
6341     bSig0 = extractFloat128Frac0( b );
6342     bExp = extractFloat128Exp( b );
6343     bSign = extractFloat128Sign( b );
6344     zSign = aSign ^ bSign;
6345     if ( aExp == 0x7FFF ) {
6346         if (    ( aSig0 | aSig1 )
6347              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6348             return propagateFloat128NaN(a, b, status);
6349         }
6350         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6351         return packFloat128( zSign, 0x7FFF, 0, 0 );
6352     }
6353     if ( bExp == 0x7FFF ) {
6354         if (bSig0 | bSig1) {
6355             return propagateFloat128NaN(a, b, status);
6356         }
6357         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6358  invalid:
6359             float_raise(float_flag_invalid, status);
6360             return float128_default_nan(status);
6361         }
6362         return packFloat128( zSign, 0x7FFF, 0, 0 );
6363     }
6364     if ( aExp == 0 ) {
6365         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6366         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6367     }
6368     if ( bExp == 0 ) {
6369         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6370         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6371     }
6372     zExp = aExp + bExp - 0x4000;
6373     aSig0 |= LIT64( 0x0001000000000000 );
6374     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6375     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6376     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6377     zSig2 |= ( zSig3 != 0 );
6378     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6379         shift128ExtraRightJamming(
6380             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6381         ++zExp;
6382     }
6383     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6384
6385 }
6386
6387 /*----------------------------------------------------------------------------
6388 | Returns the result of dividing the quadruple-precision floating-point value
6389 | `a' by the corresponding value `b'.  The operation is performed according to
6390 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6391 *----------------------------------------------------------------------------*/
6392
6393 float128 float128_div(float128 a, float128 b, float_status *status)
6394 {
6395     flag aSign, bSign, zSign;
6396     int32_t aExp, bExp, zExp;
6397     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6398     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6399
6400     aSig1 = extractFloat128Frac1( a );
6401     aSig0 = extractFloat128Frac0( a );
6402     aExp = extractFloat128Exp( a );
6403     aSign = extractFloat128Sign( a );
6404     bSig1 = extractFloat128Frac1( b );
6405     bSig0 = extractFloat128Frac0( b );
6406     bExp = extractFloat128Exp( b );
6407     bSign = extractFloat128Sign( b );
6408     zSign = aSign ^ bSign;
6409     if ( aExp == 0x7FFF ) {
6410         if (aSig0 | aSig1) {
6411             return propagateFloat128NaN(a, b, status);
6412         }
6413         if ( bExp == 0x7FFF ) {
6414             if (bSig0 | bSig1) {
6415                 return propagateFloat128NaN(a, b, status);
6416             }
6417             goto invalid;
6418         }
6419         return packFloat128( zSign, 0x7FFF, 0, 0 );
6420     }
6421     if ( bExp == 0x7FFF ) {
6422         if (bSig0 | bSig1) {
6423             return propagateFloat128NaN(a, b, status);
6424         }
6425         return packFloat128( zSign, 0, 0, 0 );
6426     }
6427     if ( bExp == 0 ) {
6428         if ( ( bSig0 | bSig1 ) == 0 ) {
6429             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6430  invalid:
6431                 float_raise(float_flag_invalid, status);
6432                 return float128_default_nan(status);
6433             }
6434             float_raise(float_flag_divbyzero, status);
6435             return packFloat128( zSign, 0x7FFF, 0, 0 );
6436         }
6437         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6438     }
6439     if ( aExp == 0 ) {
6440         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6441         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6442     }
6443     zExp = aExp - bExp + 0x3FFD;
6444     shortShift128Left(
6445         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6446     shortShift128Left(
6447         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6448     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6449         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6450         ++zExp;
6451     }
6452     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6453     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6454     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6455     while ( (int64_t) rem0 < 0 ) {
6456         --zSig0;
6457         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6458     }
6459     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6460     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6461         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6462         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6463         while ( (int64_t) rem1 < 0 ) {
6464             --zSig1;
6465             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6466         }
6467         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6468     }
6469     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6470     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6471
6472 }
6473
6474 /*----------------------------------------------------------------------------
6475 | Returns the remainder of the quadruple-precision floating-point value `a'
6476 | with respect to the corresponding value `b'.  The operation is performed
6477 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6478 *----------------------------------------------------------------------------*/
6479
6480 float128 float128_rem(float128 a, float128 b, float_status *status)
6481 {
6482     flag aSign, zSign;
6483     int32_t aExp, bExp, expDiff;
6484     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6485     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6486     int64_t sigMean0;
6487
6488     aSig1 = extractFloat128Frac1( a );
6489     aSig0 = extractFloat128Frac0( a );
6490     aExp = extractFloat128Exp( a );
6491     aSign = extractFloat128Sign( a );
6492     bSig1 = extractFloat128Frac1( b );
6493     bSig0 = extractFloat128Frac0( b );
6494     bExp = extractFloat128Exp( b );
6495     if ( aExp == 0x7FFF ) {
6496         if (    ( aSig0 | aSig1 )
6497              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6498             return propagateFloat128NaN(a, b, status);
6499         }
6500         goto invalid;
6501     }
6502     if ( bExp == 0x7FFF ) {
6503         if (bSig0 | bSig1) {
6504             return propagateFloat128NaN(a, b, status);
6505         }
6506         return a;
6507     }
6508     if ( bExp == 0 ) {
6509         if ( ( bSig0 | bSig1 ) == 0 ) {
6510  invalid:
6511             float_raise(float_flag_invalid, status);
6512             return float128_default_nan(status);
6513         }
6514         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6515     }
6516     if ( aExp == 0 ) {
6517         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6518         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6519     }
6520     expDiff = aExp - bExp;
6521     if ( expDiff < -1 ) return a;
6522     shortShift128Left(
6523         aSig0 | LIT64( 0x0001000000000000 ),
6524         aSig1,
6525         15 - ( expDiff < 0 ),
6526         &aSig0,
6527         &aSig1
6528     );
6529     shortShift128Left(
6530         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6531     q = le128( bSig0, bSig1, aSig0, aSig1 );
6532     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6533     expDiff -= 64;
6534     while ( 0 < expDiff ) {
6535         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6536         q = ( 4 < q ) ? q - 4 : 0;
6537         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6538         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6539         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6540         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6541         expDiff -= 61;
6542     }
6543     if ( -64 < expDiff ) {
6544         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6545         q = ( 4 < q ) ? q - 4 : 0;
6546         q >>= - expDiff;
6547         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6548         expDiff += 52;
6549         if ( expDiff < 0 ) {
6550             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6551         }
6552         else {
6553             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6554         }
6555         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6556         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6557     }
6558     else {
6559         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6560         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6561     }
6562     do {
6563         alternateASig0 = aSig0;
6564         alternateASig1 = aSig1;
6565         ++q;
6566         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6567     } while ( 0 <= (int64_t) aSig0 );
6568     add128(
6569         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6570     if (    ( sigMean0 < 0 )
6571          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6572         aSig0 = alternateASig0;
6573         aSig1 = alternateASig1;
6574     }
6575     zSign = ( (int64_t) aSig0 < 0 );
6576     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6577     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6578                                          status);
6579 }
6580
6581 /*----------------------------------------------------------------------------
6582 | Returns the square root of the quadruple-precision floating-point value `a'.
6583 | The operation is performed according to the IEC/IEEE Standard for Binary
6584 | Floating-Point Arithmetic.
6585 *----------------------------------------------------------------------------*/
6586
6587 float128 float128_sqrt(float128 a, float_status *status)
6588 {
6589     flag aSign;
6590     int32_t aExp, zExp;
6591     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6592     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6593
6594     aSig1 = extractFloat128Frac1( a );
6595     aSig0 = extractFloat128Frac0( a );
6596     aExp = extractFloat128Exp( a );
6597     aSign = extractFloat128Sign( a );
6598     if ( aExp == 0x7FFF ) {
6599         if (aSig0 | aSig1) {
6600             return propagateFloat128NaN(a, a, status);
6601         }
6602         if ( ! aSign ) return a;
6603         goto invalid;
6604     }
6605     if ( aSign ) {
6606         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6607  invalid:
6608         float_raise(float_flag_invalid, status);
6609         return float128_default_nan(status);
6610     }
6611     if ( aExp == 0 ) {
6612         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6613         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6614     }
6615     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6616     aSig0 |= LIT64( 0x0001000000000000 );
6617     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6618     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6619     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6620     doubleZSig0 = zSig0<<1;
6621     mul64To128( zSig0, zSig0, &term0, &term1 );
6622     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6623     while ( (int64_t) rem0 < 0 ) {
6624         --zSig0;
6625         doubleZSig0 -= 2;
6626         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6627     }
6628     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6629     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6630         if ( zSig1 == 0 ) zSig1 = 1;
6631         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6632         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6633         mul64To128( zSig1, zSig1, &term2, &term3 );
6634         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6635         while ( (int64_t) rem1 < 0 ) {
6636             --zSig1;
6637             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6638             term3 |= 1;
6639             term2 |= doubleZSig0;
6640             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6641         }
6642         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6643     }
6644     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6645     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6646
6647 }
6648
6649 /*----------------------------------------------------------------------------
6650 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6651 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6652 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6653 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6654 *----------------------------------------------------------------------------*/
6655
6656 int float128_eq(float128 a, float128 b, float_status *status)
6657 {
6658
6659     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6660               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6661          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6662               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6663        ) {
6664         float_raise(float_flag_invalid, status);
6665         return 0;
6666     }
6667     return
6668            ( a.low == b.low )
6669         && (    ( a.high == b.high )
6670              || (    ( a.low == 0 )
6671                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6672            );
6673
6674 }
6675
6676 /*----------------------------------------------------------------------------
6677 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6678 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6679 | exception is raised if either operand is a NaN.  The comparison is performed
6680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6681 *----------------------------------------------------------------------------*/
6682
6683 int float128_le(float128 a, float128 b, float_status *status)
6684 {
6685     flag aSign, bSign;
6686
6687     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6688               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6689          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6690               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6691        ) {
6692         float_raise(float_flag_invalid, status);
6693         return 0;
6694     }
6695     aSign = extractFloat128Sign( a );
6696     bSign = extractFloat128Sign( b );
6697     if ( aSign != bSign ) {
6698         return
6699                aSign
6700             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6701                  == 0 );
6702     }
6703     return
6704           aSign ? le128( b.high, b.low, a.high, a.low )
6705         : le128( a.high, a.low, b.high, b.low );
6706
6707 }
6708
6709 /*----------------------------------------------------------------------------
6710 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6711 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6712 | raised if either operand is a NaN.  The comparison is performed according
6713 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6714 *----------------------------------------------------------------------------*/
6715
6716 int float128_lt(float128 a, float128 b, float_status *status)
6717 {
6718     flag aSign, bSign;
6719
6720     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6721               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6722          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6723               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6724        ) {
6725         float_raise(float_flag_invalid, status);
6726         return 0;
6727     }
6728     aSign = extractFloat128Sign( a );
6729     bSign = extractFloat128Sign( b );
6730     if ( aSign != bSign ) {
6731         return
6732                aSign
6733             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6734                  != 0 );
6735     }
6736     return
6737           aSign ? lt128( b.high, b.low, a.high, a.low )
6738         : lt128( a.high, a.low, b.high, b.low );
6739
6740 }
6741
6742 /*----------------------------------------------------------------------------
6743 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6744 | be compared, and 0 otherwise.  The invalid exception is raised if either
6745 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6746 | Standard for Binary Floating-Point Arithmetic.
6747 *----------------------------------------------------------------------------*/
6748
6749 int float128_unordered(float128 a, float128 b, float_status *status)
6750 {
6751     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6752               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6753          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6754               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6755        ) {
6756         float_raise(float_flag_invalid, status);
6757         return 1;
6758     }
6759     return 0;
6760 }
6761
6762 /*----------------------------------------------------------------------------
6763 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6764 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6765 | exception.  The comparison is performed according to the IEC/IEEE Standard
6766 | for Binary Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6768
6769 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6770 {
6771
6772     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6773               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6774          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6775               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6776        ) {
6777         if (float128_is_signaling_nan(a, status)
6778          || float128_is_signaling_nan(b, status)) {
6779             float_raise(float_flag_invalid, status);
6780         }
6781         return 0;
6782     }
6783     return
6784            ( a.low == b.low )
6785         && (    ( a.high == b.high )
6786              || (    ( a.low == 0 )
6787                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6788            );
6789
6790 }
6791
6792 /*----------------------------------------------------------------------------
6793 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6794 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6795 | cause an exception.  Otherwise, the comparison is performed according to the
6796 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6797 *----------------------------------------------------------------------------*/
6798
6799 int float128_le_quiet(float128 a, float128 b, float_status *status)
6800 {
6801     flag aSign, bSign;
6802
6803     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6804               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6805          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6806               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6807        ) {
6808         if (float128_is_signaling_nan(a, status)
6809          || float128_is_signaling_nan(b, status)) {
6810             float_raise(float_flag_invalid, status);
6811         }
6812         return 0;
6813     }
6814     aSign = extractFloat128Sign( a );
6815     bSign = extractFloat128Sign( b );
6816     if ( aSign != bSign ) {
6817         return
6818                aSign
6819             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6820                  == 0 );
6821     }
6822     return
6823           aSign ? le128( b.high, b.low, a.high, a.low )
6824         : le128( a.high, a.low, b.high, b.low );
6825
6826 }
6827
6828 /*----------------------------------------------------------------------------
6829 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6830 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6831 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6832 | Standard for Binary Floating-Point Arithmetic.
6833 *----------------------------------------------------------------------------*/
6834
6835 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6836 {
6837     flag aSign, bSign;
6838
6839     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6840               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6841          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6842               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6843        ) {
6844         if (float128_is_signaling_nan(a, status)
6845          || float128_is_signaling_nan(b, status)) {
6846             float_raise(float_flag_invalid, status);
6847         }
6848         return 0;
6849     }
6850     aSign = extractFloat128Sign( a );
6851     bSign = extractFloat128Sign( b );
6852     if ( aSign != bSign ) {
6853         return
6854                aSign
6855             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6856                  != 0 );
6857     }
6858     return
6859           aSign ? lt128( b.high, b.low, a.high, a.low )
6860         : lt128( a.high, a.low, b.high, b.low );
6861
6862 }
6863
6864 /*----------------------------------------------------------------------------
6865 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6866 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6867 | comparison is performed according to the IEC/IEEE Standard for Binary
6868 | Floating-Point Arithmetic.
6869 *----------------------------------------------------------------------------*/
6870
6871 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6872 {
6873     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6874               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6875          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6876               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6877        ) {
6878         if (float128_is_signaling_nan(a, status)
6879          || float128_is_signaling_nan(b, status)) {
6880             float_raise(float_flag_invalid, status);
6881         }
6882         return 1;
6883     }
6884     return 0;
6885 }
6886
6887 #define COMPARE(s, nan_exp)                                                  \
6888 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
6889                                       int is_quiet, float_status *status)    \
6890 {                                                                            \
6891     flag aSign, bSign;                                                       \
6892     uint ## s ## _t av, bv;                                                  \
6893     a = float ## s ## _squash_input_denormal(a, status);                     \
6894     b = float ## s ## _squash_input_denormal(b, status);                     \
6895                                                                              \
6896     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6897          extractFloat ## s ## Frac( a ) ) ||                                 \
6898         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6899           extractFloat ## s ## Frac( b ) )) {                                \
6900         if (!is_quiet ||                                                     \
6901             float ## s ## _is_signaling_nan(a, status) ||                  \
6902             float ## s ## _is_signaling_nan(b, status)) {                 \
6903             float_raise(float_flag_invalid, status);                         \
6904         }                                                                    \
6905         return float_relation_unordered;                                     \
6906     }                                                                        \
6907     aSign = extractFloat ## s ## Sign( a );                                  \
6908     bSign = extractFloat ## s ## Sign( b );                                  \
6909     av = float ## s ## _val(a);                                              \
6910     bv = float ## s ## _val(b);                                              \
6911     if ( aSign != bSign ) {                                                  \
6912         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6913             /* zero case */                                                  \
6914             return float_relation_equal;                                     \
6915         } else {                                                             \
6916             return 1 - (2 * aSign);                                          \
6917         }                                                                    \
6918     } else {                                                                 \
6919         if (av == bv) {                                                      \
6920             return float_relation_equal;                                     \
6921         } else {                                                             \
6922             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6923         }                                                                    \
6924     }                                                                        \
6925 }                                                                            \
6926                                                                              \
6927 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
6928 {                                                                            \
6929     return float ## s ## _compare_internal(a, b, 0, status);                 \
6930 }                                                                            \
6931                                                                              \
6932 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
6933                                  float_status *status)                       \
6934 {                                                                            \
6935     return float ## s ## _compare_internal(a, b, 1, status);                 \
6936 }
6937
6938 COMPARE(32, 0xff)
6939 COMPARE(64, 0x7ff)
6940
6941 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6942                                             int is_quiet, float_status *status)
6943 {
6944     flag aSign, bSign;
6945
6946     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6947         float_raise(float_flag_invalid, status);
6948         return float_relation_unordered;
6949     }
6950     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6951           ( extractFloatx80Frac( a )<<1 ) ) ||
6952         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6953           ( extractFloatx80Frac( b )<<1 ) )) {
6954         if (!is_quiet ||
6955             floatx80_is_signaling_nan(a, status) ||
6956             floatx80_is_signaling_nan(b, status)) {
6957             float_raise(float_flag_invalid, status);
6958         }
6959         return float_relation_unordered;
6960     }
6961     aSign = extractFloatx80Sign( a );
6962     bSign = extractFloatx80Sign( b );
6963     if ( aSign != bSign ) {
6964
6965         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6966              ( ( a.low | b.low ) == 0 ) ) {
6967             /* zero case */
6968             return float_relation_equal;
6969         } else {
6970             return 1 - (2 * aSign);
6971         }
6972     } else {
6973         if (a.low == b.low && a.high == b.high) {
6974             return float_relation_equal;
6975         } else {
6976             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6977         }
6978     }
6979 }
6980
6981 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6982 {
6983     return floatx80_compare_internal(a, b, 0, status);
6984 }
6985
6986 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6987 {
6988     return floatx80_compare_internal(a, b, 1, status);
6989 }
6990
6991 static inline int float128_compare_internal(float128 a, float128 b,
6992                                             int is_quiet, float_status *status)
6993 {
6994     flag aSign, bSign;
6995
6996     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6997           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6998         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6999           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7000         if (!is_quiet ||
7001             float128_is_signaling_nan(a, status) ||
7002             float128_is_signaling_nan(b, status)) {
7003             float_raise(float_flag_invalid, status);
7004         }
7005         return float_relation_unordered;
7006     }
7007     aSign = extractFloat128Sign( a );
7008     bSign = extractFloat128Sign( b );
7009     if ( aSign != bSign ) {
7010         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7011             /* zero case */
7012             return float_relation_equal;
7013         } else {
7014             return 1 - (2 * aSign);
7015         }
7016     } else {
7017         if (a.low == b.low && a.high == b.high) {
7018             return float_relation_equal;
7019         } else {
7020             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7021         }
7022     }
7023 }
7024
7025 int float128_compare(float128 a, float128 b, float_status *status)
7026 {
7027     return float128_compare_internal(a, b, 0, status);
7028 }
7029
7030 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7031 {
7032     return float128_compare_internal(a, b, 1, status);
7033 }
7034
7035 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7036 {
7037     flag aSign;
7038     int32_t aExp;
7039     uint64_t aSig;
7040
7041     if (floatx80_invalid_encoding(a)) {
7042         float_raise(float_flag_invalid, status);
7043         return floatx80_default_nan(status);
7044     }
7045     aSig = extractFloatx80Frac( a );
7046     aExp = extractFloatx80Exp( a );
7047     aSign = extractFloatx80Sign( a );
7048
7049     if ( aExp == 0x7FFF ) {
7050         if ( aSig<<1 ) {
7051             return propagateFloatx80NaN(a, a, status);
7052         }
7053         return a;
7054     }
7055
7056     if (aExp == 0) {
7057         if (aSig == 0) {
7058             return a;
7059         }
7060         aExp++;
7061     }
7062
7063     if (n > 0x10000) {
7064         n = 0x10000;
7065     } else if (n < -0x10000) {
7066         n = -0x10000;
7067     }
7068
7069     aExp += n;
7070     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7071                                          aSign, aExp, aSig, 0, status);
7072 }
7073
7074 float128 float128_scalbn(float128 a, int n, float_status *status)
7075 {
7076     flag aSign;
7077     int32_t aExp;
7078     uint64_t aSig0, aSig1;
7079
7080     aSig1 = extractFloat128Frac1( a );
7081     aSig0 = extractFloat128Frac0( a );
7082     aExp = extractFloat128Exp( a );
7083     aSign = extractFloat128Sign( a );
7084     if ( aExp == 0x7FFF ) {
7085         if ( aSig0 | aSig1 ) {
7086             return propagateFloat128NaN(a, a, status);
7087         }
7088         return a;
7089     }
7090     if (aExp != 0) {
7091         aSig0 |= LIT64( 0x0001000000000000 );
7092     } else if (aSig0 == 0 && aSig1 == 0) {
7093         return a;
7094     } else {
7095         aExp++;
7096     }
7097
7098     if (n > 0x10000) {
7099         n = 0x10000;
7100     } else if (n < -0x10000) {
7101         n = -0x10000;
7102     }
7103
7104     aExp += n - 1;
7105     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7106                                          , status);
7107
7108 }