fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86 #include "qemu/bitops.h"
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "fpu/softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Returns the fraction bits of the half-precision floating-point value `a'.
 100 *----------------------------------------------------------------------------*/
 101
 102 static inline uint32_t extractFloat16Frac(float16 a)
 103 {
 104     return float16_val(a) & 0x3ff;
 105 }
 106
 107 /*----------------------------------------------------------------------------
 108 | Returns the exponent bits of the half-precision floating-point value `a'.
 109 *----------------------------------------------------------------------------*/
 110
 111 static inline int extractFloat16Exp(float16 a)
 112 {
 113     return (float16_val(a) >> 10) & 0x1f;
 114 }
 115
 116 /*----------------------------------------------------------------------------
 117 | Returns the sign bit of the single-precision floating-point value `a'.
 118 *----------------------------------------------------------------------------*/
 119
 120 static inline flag extractFloat16Sign(float16 a)
 121 {
 122     return float16_val(a)>>15;
 123 }
 124
 125 /*----------------------------------------------------------------------------
 126 | Returns the fraction bits of the single-precision floating-point value `a'.
 127 *----------------------------------------------------------------------------*/
 128
 129 static inline uint32_t extractFloat32Frac(float32 a)
 130 {
 131     return float32_val(a) & 0x007FFFFF;
 132 }
 133
 134 /*----------------------------------------------------------------------------
 135 | Returns the exponent bits of the single-precision floating-point value `a'.
 136 *----------------------------------------------------------------------------*/
 137
 138 static inline int extractFloat32Exp(float32 a)
 139 {
 140     return (float32_val(a) >> 23) & 0xFF;
 141 }
 142
 143 /*----------------------------------------------------------------------------
 144 | Returns the sign bit of the single-precision floating-point value `a'.
 145 *----------------------------------------------------------------------------*/
 146
 147 static inline flag extractFloat32Sign(float32 a)
 148 {
 149     return float32_val(a) >> 31;
 150 }
 151
 152 /*----------------------------------------------------------------------------
 153 | Returns the fraction bits of the double-precision floating-point value `a'.
 154 *----------------------------------------------------------------------------*/
 155
 156 static inline uint64_t extractFloat64Frac(float64 a)
 157 {
 158     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
 159 }
 160
 161 /*----------------------------------------------------------------------------
 162 | Returns the exponent bits of the double-precision floating-point value `a'.
 163 *----------------------------------------------------------------------------*/
 164
 165 static inline int extractFloat64Exp(float64 a)
 166 {
 167     return (float64_val(a) >> 52) & 0x7FF;
 168 }
 169
 170 /*----------------------------------------------------------------------------
 171 | Returns the sign bit of the double-precision floating-point value `a'.
 172 *----------------------------------------------------------------------------*/
 173
 174 static inline flag extractFloat64Sign(float64 a)
 175 {
 176     return float64_val(a) >> 63;
 177 }
 178
 179 /*
 180  * Classify a floating point number. Everything above float_class_qnan
 181  * is a NaN so cls >= float_class_qnan is any NaN.
 182  */
 183
 184 typedef enum __attribute__ ((__packed__)) {
 185     float_class_unclassified,
 186     float_class_zero,
 187     float_class_normal,
 188     float_class_inf,
 189     float_class_qnan,  /* all NaNs from here */
 190     float_class_snan,
 191     float_class_msnan, /* maybe silenced */
 192 } FloatClass;
 193
 194 /*
 195  * Structure holding all of the decomposed parts of a float. The
 196  * exponent is unbiased and the fraction is normalized. All
 197  * calculations are done with a 64 bit fraction and then rounded as
 198  * appropriate for the final format.
 199  *
 200  * Thanks to the packed FloatClass a decent compiler should be able to
 201  * fit the whole structure into registers and avoid using the stack
 202  * for parameter passing.
 203  */
 204
 205 typedef struct {
 206     uint64_t frac;
 207     int32_t  exp;
 208     FloatClass cls;
 209     bool sign;
 210 } FloatParts;
 211
 212 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 213 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 214 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
 215
 216 /* Structure holding all of the relevant parameters for a format.
 217  *   exp_size: the size of the exponent field
 218  *   exp_bias: the offset applied to the exponent field
 219  *   exp_max: the maximum normalised exponent
 220  *   frac_size: the size of the fraction field
 221  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 222  * The following are computed based the size of fraction
 223  *   frac_lsb: least significant bit of fraction
 224  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
 225  *   round_mask/roundeven_mask: masks used for rounding
 226  */
 227 typedef struct {
 228     int exp_size;
 229     int exp_bias;
 230     int exp_max;
 231     int frac_size;
 232     int frac_shift;
 233     uint64_t frac_lsb;
 234     uint64_t frac_lsbm1;
 235     uint64_t round_mask;
 236     uint64_t roundeven_mask;
 237 } FloatFmt;
 238
 239 /* Expand fields based on the size of exponent and fraction */
 240 #define FLOAT_PARAMS(E, F)                                           \
 241     .exp_size       = E,                                             \
 242     .exp_bias       = ((1 << E) - 1) >> 1,                           \
 243     .exp_max        = (1 << E) - 1,                                  \
 244     .frac_size      = F,                                             \
 245     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
 246     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
 247     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
 248     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
 249     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
 250
 251 static const FloatFmt float16_params = {
 252     FLOAT_PARAMS(5, 10)
 253 };
 254
 255 static const FloatFmt float32_params = {
 256     FLOAT_PARAMS(8, 23)
 257 };
 258
 259 static const FloatFmt float64_params = {
 260     FLOAT_PARAMS(11, 52)
 261 };
 262
 263 /* Unpack a float to parts, but do not canonicalize.  */
 264 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
 265 {
 266     const int sign_pos = fmt.frac_size + fmt.exp_size;
 267
 268     return (FloatParts) {
 269         .cls = float_class_unclassified,
 270         .sign = extract64(raw, sign_pos, 1),
 271         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
 272         .frac = extract64(raw, 0, fmt.frac_size),
 273     };
 274 }
 275
 276 static inline FloatParts float16_unpack_raw(float16 f)
 277 {
 278     return unpack_raw(float16_params, f);
 279 }
 280
 281 static inline FloatParts float32_unpack_raw(float32 f)
 282 {
 283     return unpack_raw(float32_params, f);
 284 }
 285
 286 static inline FloatParts float64_unpack_raw(float64 f)
 287 {
 288     return unpack_raw(float64_params, f);
 289 }
 290
 291 /* Pack a float from parts, but do not canonicalize.  */
 292 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
 293 {
 294     const int sign_pos = fmt.frac_size + fmt.exp_size;
 295     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
 296     return deposit64(ret, sign_pos, 1, p.sign);
 297 }
 298
 299 static inline float16 float16_pack_raw(FloatParts p)
 300 {
 301     return make_float16(pack_raw(float16_params, p));
 302 }
 303
 304 static inline float32 float32_pack_raw(FloatParts p)
 305 {
 306     return make_float32(pack_raw(float32_params, p));
 307 }
 308
 309 static inline float64 float64_pack_raw(FloatParts p)
 310 {
 311     return make_float64(pack_raw(float64_params, p));
 312 }
 313
 314 /*----------------------------------------------------------------------------
 315 | Functions and definitions to determine:  (1) whether tininess for underflow
 316 | is detected before or after rounding by default, (2) what (if anything)
 317 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 318 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 319 | are propagated from function inputs to output.  These details are target-
 320 | specific.
 321 *----------------------------------------------------------------------------*/
 322 #include "softfloat-specialize.h"
 323
 324 /* Canonicalize EXP and FRAC, setting CLS.  */
 325 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
 326                                float_status *status)
 327 {
 328     if (part.exp == parm->exp_max) {
 329         if (part.frac == 0) {
 330             part.cls = float_class_inf;
 331         } else {
 332             part.frac <<= parm->frac_shift;
 333             part.cls = (parts_is_snan_frac(part.frac, status)
 334                         ? float_class_snan : float_class_qnan);
 335         }
 336     } else if (part.exp == 0) {
 337         if (likely(part.frac == 0)) {
 338             part.cls = float_class_zero;
 339         } else if (status->flush_inputs_to_zero) {
 340             float_raise(float_flag_input_denormal, status);
 341             part.cls = float_class_zero;
 342             part.frac = 0;
 343         } else {
 344             int shift = clz64(part.frac) - 1;
 345             part.cls = float_class_normal;
 346             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
 347             part.frac <<= shift;
 348         }
 349     } else {
 350         part.cls = float_class_normal;
 351         part.exp -= parm->exp_bias;
 352         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
 353     }
 354     return part;
 355 }
 356
 357 /* Round and uncanonicalize a floating-point number by parts. There
 358  * are FRAC_SHIFT bits that may require rounding at the bottom of the
 359  * fraction; these bits will be removed. The exponent will be biased
 360  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 361  */
 362
 363 static FloatParts round_canonical(FloatParts p, float_status *s,
 364                                   const FloatFmt *parm)
 365 {
 366     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
 367     const uint64_t round_mask = parm->round_mask;
 368     const uint64_t roundeven_mask = parm->roundeven_mask;
 369     const int exp_max = parm->exp_max;
 370     const int frac_shift = parm->frac_shift;
 371     uint64_t frac, inc;
 372     int exp, flags = 0;
 373     bool overflow_norm;
 374
 375     frac = p.frac;
 376     exp = p.exp;
 377
 378     switch (p.cls) {
 379     case float_class_normal:
 380         switch (s->float_rounding_mode) {
 381         case float_round_nearest_even:
 382             overflow_norm = false;
 383             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
 384             break;
 385         case float_round_ties_away:
 386             overflow_norm = false;
 387             inc = frac_lsbm1;
 388             break;
 389         case float_round_to_zero:
 390             overflow_norm = true;
 391             inc = 0;
 392             break;
 393         case float_round_up:
 394             inc = p.sign ? 0 : round_mask;
 395             overflow_norm = p.sign;
 396             break;
 397         case float_round_down:
 398             inc = p.sign ? round_mask : 0;
 399             overflow_norm = !p.sign;
 400             break;
 401         default:
 402             g_assert_not_reached();
 403         }
 404
 405         exp += parm->exp_bias;
 406         if (likely(exp > 0)) {
 407             if (frac & round_mask) {
 408                 flags |= float_flag_inexact;
 409                 frac += inc;
 410                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
 411                     frac >>= 1;
 412                     exp++;
 413                 }
 414             }
 415             frac >>= frac_shift;
 416
 417             if (unlikely(exp >= exp_max)) {
 418                 flags |= float_flag_overflow | float_flag_inexact;
 419                 if (overflow_norm) {
 420                     exp = exp_max - 1;
 421                     frac = -1;
 422                 } else {
 423                     p.cls = float_class_inf;
 424                     goto do_inf;
 425                 }
 426             }
 427         } else if (s->flush_to_zero) {
 428             flags |= float_flag_output_denormal;
 429             p.cls = float_class_zero;
 430             goto do_zero;
 431         } else {
 432             bool is_tiny = (s->float_detect_tininess
 433                             == float_tininess_before_rounding)
 434                         || (exp < 0)
 435                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
 436
 437             shift64RightJamming(frac, 1 - exp, &frac);
 438             if (frac & round_mask) {
 439                 /* Need to recompute round-to-even.  */
 440                 if (s->float_rounding_mode == float_round_nearest_even) {
 441                     inc = ((frac & roundeven_mask) != frac_lsbm1
 442                            ? frac_lsbm1 : 0);
 443                 }
 444                 flags |= float_flag_inexact;
 445                 frac += inc;
 446             }
 447
 448             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
 449             frac >>= frac_shift;
 450
 451             if (is_tiny && (flags & float_flag_inexact)) {
 452                 flags |= float_flag_underflow;
 453             }
 454             if (exp == 0 && frac == 0) {
 455                 p.cls = float_class_zero;
 456             }
 457         }
 458         break;
 459
 460     case float_class_zero:
 461     do_zero:
 462         exp = 0;
 463         frac = 0;
 464         break;
 465
 466     case float_class_inf:
 467     do_inf:
 468         exp = exp_max;
 469         frac = 0;
 470         break;
 471
 472     case float_class_qnan:
 473     case float_class_snan:
 474         exp = exp_max;
 475         frac >>= parm->frac_shift;
 476         break;
 477
 478     default:
 479         g_assert_not_reached();
 480     }
 481
 482     float_raise(flags, s);
 483     p.exp = exp;
 484     p.frac = frac;
 485     return p;
 486 }
 487
 488 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
 489 {
 490     return canonicalize(float16_unpack_raw(f), &float16_params, s);
 491 }
 492
 493 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 494 {
 495     switch (p.cls) {
 496     case float_class_msnan:
 497         p.frac >>= float16_params.frac_shift;
 498         return float16_maybe_silence_nan(float16_pack_raw(p), s);
 499     default:
 500         p = round_canonical(p, s, &float16_params);
 501         return float16_pack_raw(p);
 502     }
 503 }
 504
 505 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 506 {
 507     return canonicalize(float32_unpack_raw(f), &float32_params, s);
 508 }
 509
 510 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 511 {
 512     switch (p.cls) {
 513     case float_class_msnan:
 514         p.frac >>= float32_params.frac_shift;
 515         return float32_maybe_silence_nan(float32_pack_raw(p), s);
 516     default:
 517         p = round_canonical(p, s, &float32_params);
 518         return float32_pack_raw(p);
 519     }
 520 }
 521
 522 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 523 {
 524     return canonicalize(float64_unpack_raw(f), &float64_params, s);
 525 }
 526
 527 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
 528 {
 529     switch (p.cls) {
 530     case float_class_msnan:
 531         p.frac >>= float64_params.frac_shift;
 532         return float64_maybe_silence_nan(float64_pack_raw(p), s);
 533     default:
 534         p = round_canonical(p, s, &float64_params);
 535         return float64_pack_raw(p);
 536     }
 537 }
 538
 539 /* Simple helpers for checking if what NaN we have */
 540 static bool is_nan(FloatClass c)
 541 {
 542     return unlikely(c >= float_class_qnan);
 543 }
 544 static bool is_snan(FloatClass c)
 545 {
 546     return c == float_class_snan;
 547 }
 548 static bool is_qnan(FloatClass c)
 549 {
 550     return c == float_class_qnan;
 551 }
 552
 553 static FloatParts return_nan(FloatParts a, float_status *s)
 554 {
 555     switch (a.cls) {
 556     case float_class_snan:
 557         s->float_exception_flags |= float_flag_invalid;
 558         a.cls = float_class_msnan;
 559         /* fall through */
 560     case float_class_qnan:
 561         if (s->default_nan_mode) {
 562             return parts_default_nan(s);
 563         }
 564         break;
 565
 566     default:
 567         g_assert_not_reached();
 568     }
 569     return a;
 570 }
 571
 572 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
 573 {
 574     if (is_snan(a.cls) || is_snan(b.cls)) {
 575         s->float_exception_flags |= float_flag_invalid;
 576     }
 577
 578     if (s->default_nan_mode) {
 579         return parts_default_nan(s);
 580     } else {
 581         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
 582                     is_qnan(b.cls), is_snan(b.cls),
 583                     a.frac > b.frac ||
 584                     (a.frac == b.frac && a.sign < b.sign))) {
 585             a = b;
 586         }
 587         a.cls = float_class_msnan;
 588     }
 589     return a;
 590 }
 591
 592 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
 593                                   bool inf_zero, float_status *s)
 594 {
 595     int which;
 596
 597     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
 598         s->float_exception_flags |= float_flag_invalid;
 599     }
 600
 601     which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
 602                           is_qnan(b.cls), is_snan(b.cls),
 603                           is_qnan(c.cls), is_snan(c.cls),
 604                           inf_zero, s);
 605
 606     if (s->default_nan_mode) {
 607         /* Note that this check is after pickNaNMulAdd so that function
 608          * has an opportunity to set the Invalid flag.
 609          */
 610         which = 3;
 611     }
 612
 613     switch (which) {
 614     case 0:
 615         break;
 616     case 1:
 617         a = b;
 618         break;
 619     case 2:
 620         a = c;
 621         break;
 622     case 3:
 623         return parts_default_nan(s);
 624     default:
 625         g_assert_not_reached();
 626     }
 627     a.cls = float_class_msnan;
 628
 629     return a;
 630 }
 631
 632 /*
 633  * Returns the result of adding or subtracting the values of the
 634  * floating-point values `a' and `b'. The operation is performed
 635  * according to the IEC/IEEE Standard for Binary Floating-Point
 636  * Arithmetic.
 637  */
 638
 639 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
 640                                 float_status *s)
 641 {
 642     bool a_sign = a.sign;
 643     bool b_sign = b.sign ^ subtract;
 644
 645     if (a_sign != b_sign) {
 646         /* Subtraction */
 647
 648         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 649             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
 650                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 651                 a.frac = a.frac - b.frac;
 652             } else {
 653                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 654                 a.frac = b.frac - a.frac;
 655                 a.exp = b.exp;
 656                 a_sign ^= 1;
 657             }
 658
 659             if (a.frac == 0) {
 660                 a.cls = float_class_zero;
 661                 a.sign = s->float_rounding_mode == float_round_down;
 662             } else {
 663                 int shift = clz64(a.frac) - 1;
 664                 a.frac = a.frac << shift;
 665                 a.exp = a.exp - shift;
 666                 a.sign = a_sign;
 667             }
 668             return a;
 669         }
 670         if (is_nan(a.cls) || is_nan(b.cls)) {
 671             return pick_nan(a, b, s);
 672         }
 673         if (a.cls == float_class_inf) {
 674             if (b.cls == float_class_inf) {
 675                 float_raise(float_flag_invalid, s);
 676                 return parts_default_nan(s);
 677             }
 678             return a;
 679         }
 680         if (a.cls == float_class_zero && b.cls == float_class_zero) {
 681             a.sign = s->float_rounding_mode == float_round_down;
 682             return a;
 683         }
 684         if (a.cls == float_class_zero || b.cls == float_class_inf) {
 685             b.sign = a_sign ^ 1;
 686             return b;
 687         }
 688         if (b.cls == float_class_zero) {
 689             return a;
 690         }
 691     } else {
 692         /* Addition */
 693         if (a.cls == float_class_normal && b.cls == float_class_normal) {
 694             if (a.exp > b.exp) {
 695                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
 696             } else if (a.exp < b.exp) {
 697                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
 698                 a.exp = b.exp;
 699             }
 700             a.frac += b.frac;
 701             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
 702                 a.frac >>= 1;
 703                 a.exp += 1;
 704             }
 705             return a;
 706         }
 707         if (is_nan(a.cls) || is_nan(b.cls)) {
 708             return pick_nan(a, b, s);
 709         }
 710         if (a.cls == float_class_inf || b.cls == float_class_zero) {
 711             return a;
 712         }
 713         if (b.cls == float_class_inf || a.cls == float_class_zero) {
 714             b.sign = b_sign;
 715             return b;
 716         }
 717     }
 718     g_assert_not_reached();
 719 }
 720
 721 /*
 722  * Returns the result of adding or subtracting the floating-point
 723  * values `a' and `b'. The operation is performed according to the
 724  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 725  */
 726
 727 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
 728                                               float_status *status)
 729 {
 730     FloatParts pa = float16_unpack_canonical(a, status);
 731     FloatParts pb = float16_unpack_canonical(b, status);
 732     FloatParts pr = addsub_floats(pa, pb, false, status);
 733
 734     return float16_round_pack_canonical(pr, status);
 735 }
 736
 737 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
 738                                              float_status *status)
 739 {
 740     FloatParts pa = float32_unpack_canonical(a, status);
 741     FloatParts pb = float32_unpack_canonical(b, status);
 742     FloatParts pr = addsub_floats(pa, pb, false, status);
 743
 744     return float32_round_pack_canonical(pr, status);
 745 }
 746
 747 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
 748                                              float_status *status)
 749 {
 750     FloatParts pa = float64_unpack_canonical(a, status);
 751     FloatParts pb = float64_unpack_canonical(b, status);
 752     FloatParts pr = addsub_floats(pa, pb, false, status);
 753
 754     return float64_round_pack_canonical(pr, status);
 755 }
 756
 757 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
 758                                              float_status *status)
 759 {
 760     FloatParts pa = float16_unpack_canonical(a, status);
 761     FloatParts pb = float16_unpack_canonical(b, status);
 762     FloatParts pr = addsub_floats(pa, pb, true, status);
 763
 764     return float16_round_pack_canonical(pr, status);
 765 }
 766
 767 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
 768                                              float_status *status)
 769 {
 770     FloatParts pa = float32_unpack_canonical(a, status);
 771     FloatParts pb = float32_unpack_canonical(b, status);
 772     FloatParts pr = addsub_floats(pa, pb, true, status);
 773
 774     return float32_round_pack_canonical(pr, status);
 775 }
 776
 777 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
 778                                              float_status *status)
 779 {
 780     FloatParts pa = float64_unpack_canonical(a, status);
 781     FloatParts pb = float64_unpack_canonical(b, status);
 782     FloatParts pr = addsub_floats(pa, pb, true, status);
 783
 784     return float64_round_pack_canonical(pr, status);
 785 }
 786
 787 /*
 788  * Returns the result of multiplying the floating-point values `a' and
 789  * `b'. The operation is performed according to the IEC/IEEE Standard
 790  * for Binary Floating-Point Arithmetic.
 791  */
 792
 793 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
 794 {
 795     bool sign = a.sign ^ b.sign;
 796
 797     if (a.cls == float_class_normal && b.cls == float_class_normal) {
 798         uint64_t hi, lo;
 799         int exp = a.exp + b.exp;
 800
 801         mul64To128(a.frac, b.frac, &hi, &lo);
 802         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 803         if (lo & DECOMPOSED_OVERFLOW_BIT) {
 804             shift64RightJamming(lo, 1, &lo);
 805             exp += 1;
 806         }
 807
 808         /* Re-use a */
 809         a.exp = exp;
 810         a.sign = sign;
 811         a.frac = lo;
 812         return a;
 813     }
 814     /* handle all the NaN cases */
 815     if (is_nan(a.cls) || is_nan(b.cls)) {
 816         return pick_nan(a, b, s);
 817     }
 818     /* Inf * Zero == NaN */
 819     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
 820         (a.cls == float_class_zero && b.cls == float_class_inf)) {
 821         s->float_exception_flags |= float_flag_invalid;
 822         return parts_default_nan(s);
 823     }
 824     /* Multiply by 0 or Inf */
 825     if (a.cls == float_class_inf || a.cls == float_class_zero) {
 826         a.sign = sign;
 827         return a;
 828     }
 829     if (b.cls == float_class_inf || b.cls == float_class_zero) {
 830         b.sign = sign;
 831         return b;
 832     }
 833     g_assert_not_reached();
 834 }
 835
 836 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
 837                                              float_status *status)
 838 {
 839     FloatParts pa = float16_unpack_canonical(a, status);
 840     FloatParts pb = float16_unpack_canonical(b, status);
 841     FloatParts pr = mul_floats(pa, pb, status);
 842
 843     return float16_round_pack_canonical(pr, status);
 844 }
 845
 846 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
 847                                              float_status *status)
 848 {
 849     FloatParts pa = float32_unpack_canonical(a, status);
 850     FloatParts pb = float32_unpack_canonical(b, status);
 851     FloatParts pr = mul_floats(pa, pb, status);
 852
 853     return float32_round_pack_canonical(pr, status);
 854 }
 855
 856 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
 857                                              float_status *status)
 858 {
 859     FloatParts pa = float64_unpack_canonical(a, status);
 860     FloatParts pb = float64_unpack_canonical(b, status);
 861     FloatParts pr = mul_floats(pa, pb, status);
 862
 863     return float64_round_pack_canonical(pr, status);
 864 }
 865
 866 /*
 867  * Returns the result of multiplying the floating-point values `a' and
 868  * `b' then adding 'c', with no intermediate rounding step after the
 869  * multiplication. The operation is performed according to the
 870  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 871  * The flags argument allows the caller to select negation of the
 872  * addend, the intermediate product, or the final result. (The
 873  * difference between this and having the caller do a separate
 874  * negation is that negating externally will flip the sign bit on
 875  * NaNs.)
 876  */
 877
 878 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
 879                                 int flags, float_status *s)
 880 {
 881     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
 882                     ((1 << float_class_inf) | (1 << float_class_zero));
 883     bool p_sign;
 884     bool sign_flip = flags & float_muladd_negate_result;
 885     FloatClass p_class;
 886     uint64_t hi, lo;
 887     int p_exp;
 888
 889     /* It is implementation-defined whether the cases of (0,inf,qnan)
 890      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 891      * they return if they do), so we have to hand this information
 892      * off to the target-specific pick-a-NaN routine.
 893      */
 894     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
 895         return pick_nan_muladd(a, b, c, inf_zero, s);
 896     }
 897
 898     if (inf_zero) {
 899         s->float_exception_flags |= float_flag_invalid;
 900         return parts_default_nan(s);
 901     }
 902
 903     if (flags & float_muladd_negate_c) {
 904         c.sign ^= 1;
 905     }
 906
 907     p_sign = a.sign ^ b.sign;
 908
 909     if (flags & float_muladd_negate_product) {
 910         p_sign ^= 1;
 911     }
 912
 913     if (a.cls == float_class_inf || b.cls == float_class_inf) {
 914         p_class = float_class_inf;
 915     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
 916         p_class = float_class_zero;
 917     } else {
 918         p_class = float_class_normal;
 919     }
 920
 921     if (c.cls == float_class_inf) {
 922         if (p_class == float_class_inf && p_sign != c.sign) {
 923             s->float_exception_flags |= float_flag_invalid;
 924             return parts_default_nan(s);
 925         } else {
 926             a.cls = float_class_inf;
 927             a.sign = c.sign ^ sign_flip;
 928             return a;
 929         }
 930     }
 931
 932     if (p_class == float_class_inf) {
 933         a.cls = float_class_inf;
 934         a.sign = p_sign ^ sign_flip;
 935         return a;
 936     }
 937
 938     if (p_class == float_class_zero) {
 939         if (c.cls == float_class_zero) {
 940             if (p_sign != c.sign) {
 941                 p_sign = s->float_rounding_mode == float_round_down;
 942             }
 943             c.sign = p_sign;
 944         } else if (flags & float_muladd_halve_result) {
 945             c.exp -= 1;
 946         }
 947         c.sign ^= sign_flip;
 948         return c;
 949     }
 950
 951     /* a & b should be normals now... */
 952     assert(a.cls == float_class_normal &&
 953            b.cls == float_class_normal);
 954
 955     p_exp = a.exp + b.exp;
 956
 957     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 958      * result.
 959      */
 960     mul64To128(a.frac, b.frac, &hi, &lo);
 961     /* binary point now at bit 124 */
 962
 963     /* check for overflow */
 964     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
 965         shift128RightJamming(hi, lo, 1, &hi, &lo);
 966         p_exp += 1;
 967     }
 968
 969     /* + add/sub */
 970     if (c.cls == float_class_zero) {
 971         /* move binary point back to 62 */
 972         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 973     } else {
 974         int exp_diff = p_exp - c.exp;
 975         if (p_sign == c.sign) {
 976             /* Addition */
 977             if (exp_diff <= 0) {
 978                 shift128RightJamming(hi, lo,
 979                                      DECOMPOSED_BINARY_POINT - exp_diff,
 980                                      &hi, &lo);
 981                 lo += c.frac;
 982                 p_exp = c.exp;
 983             } else {
 984                 uint64_t c_hi, c_lo;
 985                 /* shift c to the same binary point as the product (124) */
 986                 c_hi = c.frac >> 2;
 987                 c_lo = 0;
 988                 shift128RightJamming(c_hi, c_lo,
 989                                      exp_diff,
 990                                      &c_hi, &c_lo);
 991                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
 992                 /* move binary point back to 62 */
 993                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
 994             }
 995
 996             if (lo & DECOMPOSED_OVERFLOW_BIT) {
 997                 shift64RightJamming(lo, 1, &lo);
 998                 p_exp += 1;
 999             }
1000
1001         } else {
1002             /* Subtraction */
1003             uint64_t c_hi, c_lo;
1004             /* make C binary point match product at bit 124 */
1005             c_hi = c.frac >> 2;
1006             c_lo = 0;
1007
1008             if (exp_diff <= 0) {
1009                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1010                 if (exp_diff == 0
1011                     &&
1012                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1013                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1014                 } else {
1015                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1016                     p_sign ^= 1;
1017                     p_exp = c.exp;
1018                 }
1019             } else {
1020                 shift128RightJamming(c_hi, c_lo,
1021                                      exp_diff,
1022                                      &c_hi, &c_lo);
1023                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1024             }
1025
1026             if (hi == 0 && lo == 0) {
1027                 a.cls = float_class_zero;
1028                 a.sign = s->float_rounding_mode == float_round_down;
1029                 a.sign ^= sign_flip;
1030                 return a;
1031             } else {
1032                 int shift;
1033                 if (hi != 0) {
1034                     shift = clz64(hi);
1035                 } else {
1036                     shift = clz64(lo) + 64;
1037                 }
1038                 /* Normalizing to a binary point of 124 is the
1039                    correct adjust for the exponent.  However since we're
1040                    shifting, we might as well put the binary point back
1041                    at 62 where we really want it.  Therefore shift as
1042                    if we're leaving 1 bit at the top of the word, but
1043                    adjust the exponent as if we're leaving 3 bits.  */
1044                 shift -= 1;
1045                 if (shift >= 64) {
1046                     lo = lo << (shift - 64);
1047                 } else {
1048                     hi = (hi << shift) | (lo >> (64 - shift));
1049                     lo = hi | ((lo << shift) != 0);
1050                 }
1051                 p_exp -= shift - 2;
1052             }
1053         }
1054     }
1055
1056     if (flags & float_muladd_halve_result) {
1057         p_exp -= 1;
1058     }
1059
1060     /* finally prepare our result */
1061     a.cls = float_class_normal;
1062     a.sign = p_sign ^ sign_flip;
1063     a.exp = p_exp;
1064     a.frac = lo;
1065
1066     return a;
1067 }
1068
1069 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1070                                                 int flags, float_status *status)
1071 {
1072     FloatParts pa = float16_unpack_canonical(a, status);
1073     FloatParts pb = float16_unpack_canonical(b, status);
1074     FloatParts pc = float16_unpack_canonical(c, status);
1075     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1076
1077     return float16_round_pack_canonical(pr, status);
1078 }
1079
1080 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1081                                                 int flags, float_status *status)
1082 {
1083     FloatParts pa = float32_unpack_canonical(a, status);
1084     FloatParts pb = float32_unpack_canonical(b, status);
1085     FloatParts pc = float32_unpack_canonical(c, status);
1086     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1087
1088     return float32_round_pack_canonical(pr, status);
1089 }
1090
1091 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1092                                                 int flags, float_status *status)
1093 {
1094     FloatParts pa = float64_unpack_canonical(a, status);
1095     FloatParts pb = float64_unpack_canonical(b, status);
1096     FloatParts pc = float64_unpack_canonical(c, status);
1097     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1098
1099     return float64_round_pack_canonical(pr, status);
1100 }
1101
1102 /*
1103  * Returns the result of dividing the floating-point value `a' by the
1104  * corresponding value `b'. The operation is performed according to
1105  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1106  */
1107
1108 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1109 {
1110     bool sign = a.sign ^ b.sign;
1111
1112     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1113         uint64_t temp_lo, temp_hi;
1114         int exp = a.exp - b.exp;
1115         if (a.frac < b.frac) {
1116             exp -= 1;
1117             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1118                               &temp_hi, &temp_lo);
1119         } else {
1120             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1121                               &temp_hi, &temp_lo);
1122         }
1123         /* LSB of quot is set if inexact which roundandpack will use
1124          * to set flags. Yet again we re-use a for the result */
1125         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1126         a.sign = sign;
1127         a.exp = exp;
1128         return a;
1129     }
1130     /* handle all the NaN cases */
1131     if (is_nan(a.cls) || is_nan(b.cls)) {
1132         return pick_nan(a, b, s);
1133     }
1134     /* 0/0 or Inf/Inf */
1135     if (a.cls == b.cls
1136         &&
1137         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1138         s->float_exception_flags |= float_flag_invalid;
1139         return parts_default_nan(s);
1140     }
1141     /* Inf / x or 0 / x */
1142     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1143         a.sign = sign;
1144         return a;
1145     }
1146     /* Div 0 => Inf */
1147     if (b.cls == float_class_zero) {
1148         s->float_exception_flags |= float_flag_divbyzero;
1149         a.cls = float_class_inf;
1150         a.sign = sign;
1151         return a;
1152     }
1153     /* Div by Inf */
1154     if (b.cls == float_class_inf) {
1155         a.cls = float_class_zero;
1156         a.sign = sign;
1157         return a;
1158     }
1159     g_assert_not_reached();
1160 }
1161
1162 float16 float16_div(float16 a, float16 b, float_status *status)
1163 {
1164     FloatParts pa = float16_unpack_canonical(a, status);
1165     FloatParts pb = float16_unpack_canonical(b, status);
1166     FloatParts pr = div_floats(pa, pb, status);
1167
1168     return float16_round_pack_canonical(pr, status);
1169 }
1170
1171 float32 float32_div(float32 a, float32 b, float_status *status)
1172 {
1173     FloatParts pa = float32_unpack_canonical(a, status);
1174     FloatParts pb = float32_unpack_canonical(b, status);
1175     FloatParts pr = div_floats(pa, pb, status);
1176
1177     return float32_round_pack_canonical(pr, status);
1178 }
1179
1180 float64 float64_div(float64 a, float64 b, float_status *status)
1181 {
1182     FloatParts pa = float64_unpack_canonical(a, status);
1183     FloatParts pb = float64_unpack_canonical(b, status);
1184     FloatParts pr = div_floats(pa, pb, status);
1185
1186     return float64_round_pack_canonical(pr, status);
1187 }
1188
1189 /*
1190  * Rounds the floating-point value `a' to an integer, and returns the
1191  * result as a floating-point value. The operation is performed
1192  * according to the IEC/IEEE Standard for Binary Floating-Point
1193  * Arithmetic.
1194  */
1195
1196 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1197 {
1198     if (is_nan(a.cls)) {
1199         return return_nan(a, s);
1200     }
1201
1202     switch (a.cls) {
1203     case float_class_zero:
1204     case float_class_inf:
1205     case float_class_qnan:
1206         /* already "integral" */
1207         break;
1208     case float_class_normal:
1209         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1210             /* already integral */
1211             break;
1212         }
1213         if (a.exp < 0) {
1214             bool one;
1215             /* all fractional */
1216             s->float_exception_flags |= float_flag_inexact;
1217             switch (rounding_mode) {
1218             case float_round_nearest_even:
1219                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1220                 break;
1221             case float_round_ties_away:
1222                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1223                 break;
1224             case float_round_to_zero:
1225                 one = false;
1226                 break;
1227             case float_round_up:
1228                 one = !a.sign;
1229                 break;
1230             case float_round_down:
1231                 one = a.sign;
1232                 break;
1233             default:
1234                 g_assert_not_reached();
1235             }
1236
1237             if (one) {
1238                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1239                 a.exp = 0;
1240             } else {
1241                 a.cls = float_class_zero;
1242             }
1243         } else {
1244             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1245             uint64_t frac_lsbm1 = frac_lsb >> 1;
1246             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1247             uint64_t rnd_mask = rnd_even_mask >> 1;
1248             uint64_t inc;
1249
1250             switch (rounding_mode) {
1251             case float_round_nearest_even:
1252                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1253                 break;
1254             case float_round_ties_away:
1255                 inc = frac_lsbm1;
1256                 break;
1257             case float_round_to_zero:
1258                 inc = 0;
1259                 break;
1260             case float_round_up:
1261                 inc = a.sign ? 0 : rnd_mask;
1262                 break;
1263             case float_round_down:
1264                 inc = a.sign ? rnd_mask : 0;
1265                 break;
1266             default:
1267                 g_assert_not_reached();
1268             }
1269
1270             if (a.frac & rnd_mask) {
1271                 s->float_exception_flags |= float_flag_inexact;
1272                 a.frac += inc;
1273                 a.frac &= ~rnd_mask;
1274                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1275                     a.frac >>= 1;
1276                     a.exp++;
1277                 }
1278             }
1279         }
1280         break;
1281     default:
1282         g_assert_not_reached();
1283     }
1284     return a;
1285 }
1286
1287 float16 float16_round_to_int(float16 a, float_status *s)
1288 {
1289     FloatParts pa = float16_unpack_canonical(a, s);
1290     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1291     return float16_round_pack_canonical(pr, s);
1292 }
1293
1294 float32 float32_round_to_int(float32 a, float_status *s)
1295 {
1296     FloatParts pa = float32_unpack_canonical(a, s);
1297     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1298     return float32_round_pack_canonical(pr, s);
1299 }
1300
1301 float64 float64_round_to_int(float64 a, float_status *s)
1302 {
1303     FloatParts pa = float64_unpack_canonical(a, s);
1304     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1305     return float64_round_pack_canonical(pr, s);
1306 }
1307
1308 float64 float64_trunc_to_int(float64 a, float_status *s)
1309 {
1310     FloatParts pa = float64_unpack_canonical(a, s);
1311     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1312     return float64_round_pack_canonical(pr, s);
1313 }
1314
1315 /*
1316  * Returns the result of converting the floating-point value `a' to
1317  * the two's complement integer format. The conversion is performed
1318  * according to the IEC/IEEE Standard for Binary Floating-Point
1319  * Arithmetic---which means in particular that the conversion is
1320  * rounded according to the current rounding mode. If `a' is a NaN,
1321  * the largest positive integer is returned. Otherwise, if the
1322  * conversion overflows, the largest integer with the same sign as `a'
1323  * is returned.
1324 */
1325
1326 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1327                                      int64_t min, int64_t max,
1328                                      float_status *s)
1329 {
1330     uint64_t r;
1331     int orig_flags = get_float_exception_flags(s);
1332     FloatParts p = round_to_int(in, rmode, s);
1333
1334     switch (p.cls) {
1335     case float_class_snan:
1336     case float_class_qnan:
1337     case float_class_msnan:
1338         s->float_exception_flags = orig_flags | float_flag_invalid;
1339         return max;
1340     case float_class_inf:
1341         s->float_exception_flags = orig_flags | float_flag_invalid;
1342         return p.sign ? min : max;
1343     case float_class_zero:
1344         return 0;
1345     case float_class_normal:
1346         if (p.exp < DECOMPOSED_BINARY_POINT) {
1347             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1348         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1349             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1350         } else {
1351             r = UINT64_MAX;
1352         }
1353         if (p.sign) {
1354             if (r <= -(uint64_t) min) {
1355                 return -r;
1356             } else {
1357                 s->float_exception_flags = orig_flags | float_flag_invalid;
1358                 return min;
1359             }
1360         } else {
1361             if (r <= max) {
1362                 return r;
1363             } else {
1364                 s->float_exception_flags = orig_flags | float_flag_invalid;
1365                 return max;
1366             }
1367         }
1368     default:
1369         g_assert_not_reached();
1370     }
1371 }
1372
1373 #define FLOAT_TO_INT(fsz, isz)                                          \
1374 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1375                                                 float_status *s)        \
1376 {                                                                       \
1377     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1378     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1379                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1380                                  s);                                    \
1381 }                                                                       \
1382                                                                         \
1383 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1384  (float ## fsz a, float_status *s)                                      \
1385 {                                                                       \
1386     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1387     return round_to_int_and_pack(p, float_round_to_zero,                \
1388                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1389                                  s);                                    \
1390 }
1391
1392 FLOAT_TO_INT(16, 16)
1393 FLOAT_TO_INT(16, 32)
1394 FLOAT_TO_INT(16, 64)
1395
1396 FLOAT_TO_INT(32, 16)
1397 FLOAT_TO_INT(32, 32)
1398 FLOAT_TO_INT(32, 64)
1399
1400 FLOAT_TO_INT(64, 16)
1401 FLOAT_TO_INT(64, 32)
1402 FLOAT_TO_INT(64, 64)
1403
1404 #undef FLOAT_TO_INT
1405
1406 /*
1407  *  Returns the result of converting the floating-point value `a' to
1408  *  the unsigned integer format. The conversion is performed according
1409  *  to the IEC/IEEE Standard for Binary Floating-Point
1410  *  Arithmetic---which means in particular that the conversion is
1411  *  rounded according to the current rounding mode. If `a' is a NaN,
1412  *  the largest unsigned integer is returned. Otherwise, if the
1413  *  conversion overflows, the largest unsigned integer is returned. If
1414  *  the 'a' is negative, the result is rounded and zero is returned;
1415  *  values that do not round to zero will raise the inexact exception
1416  *  flag.
1417  */
1418
1419 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1420                                        float_status *s)
1421 {
1422     int orig_flags = get_float_exception_flags(s);
1423     FloatParts p = round_to_int(in, rmode, s);
1424
1425     switch (p.cls) {
1426     case float_class_snan:
1427     case float_class_qnan:
1428     case float_class_msnan:
1429         s->float_exception_flags = orig_flags | float_flag_invalid;
1430         return max;
1431     case float_class_inf:
1432         s->float_exception_flags = orig_flags | float_flag_invalid;
1433         return p.sign ? 0 : max;
1434     case float_class_zero:
1435         return 0;
1436     case float_class_normal:
1437     {
1438         uint64_t r;
1439         if (p.sign) {
1440             s->float_exception_flags = orig_flags | float_flag_invalid;
1441             return 0;
1442         }
1443
1444         if (p.exp < DECOMPOSED_BINARY_POINT) {
1445             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1446         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1447             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1448         } else {
1449             s->float_exception_flags = orig_flags | float_flag_invalid;
1450             return max;
1451         }
1452
1453         /* For uint64 this will never trip, but if p.exp is too large
1454          * to shift a decomposed fraction we shall have exited via the
1455          * 3rd leg above.
1456          */
1457         if (r > max) {
1458             s->float_exception_flags = orig_flags | float_flag_invalid;
1459             return max;
1460         } else {
1461             return r;
1462         }
1463     }
1464     default:
1465         g_assert_not_reached();
1466     }
1467 }
1468
1469 #define FLOAT_TO_UINT(fsz, isz) \
1470 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1471                                                   float_status *s)      \
1472 {                                                                       \
1473     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1474     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1475                                  UINT ## isz ## _MAX, s);               \
1476 }                                                                       \
1477                                                                         \
1478 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1479  (float ## fsz a, float_status *s)                                      \
1480 {                                                                       \
1481     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1482     return round_to_uint_and_pack(p, float_round_to_zero,               \
1483                                   UINT ## isz ## _MAX, s);              \
1484 }
1485
1486 FLOAT_TO_UINT(16, 16)
1487 FLOAT_TO_UINT(16, 32)
1488 FLOAT_TO_UINT(16, 64)
1489
1490 FLOAT_TO_UINT(32, 16)
1491 FLOAT_TO_UINT(32, 32)
1492 FLOAT_TO_UINT(32, 64)
1493
1494 FLOAT_TO_UINT(64, 16)
1495 FLOAT_TO_UINT(64, 32)
1496 FLOAT_TO_UINT(64, 64)
1497
1498 #undef FLOAT_TO_UINT
1499
1500 /*
1501  * Integer to float conversions
1502  *
1503  * Returns the result of converting the two's complement integer `a'
1504  * to the floating-point format. The conversion is performed according
1505  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1506  */
1507
1508 static FloatParts int_to_float(int64_t a, float_status *status)
1509 {
1510     FloatParts r = {};
1511     if (a == 0) {
1512         r.cls = float_class_zero;
1513         r.sign = false;
1514     } else if (a == (1ULL << 63)) {
1515         r.cls = float_class_normal;
1516         r.sign = true;
1517         r.frac = DECOMPOSED_IMPLICIT_BIT;
1518         r.exp = 63;
1519     } else {
1520         uint64_t f;
1521         if (a < 0) {
1522             f = -a;
1523             r.sign = true;
1524         } else {
1525             f = a;
1526             r.sign = false;
1527         }
1528         int shift = clz64(f) - 1;
1529         r.cls = float_class_normal;
1530         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1531         r.frac = f << shift;
1532     }
1533
1534     return r;
1535 }
1536
1537 float16 int64_to_float16(int64_t a, float_status *status)
1538 {
1539     FloatParts pa = int_to_float(a, status);
1540     return float16_round_pack_canonical(pa, status);
1541 }
1542
1543 float16 int32_to_float16(int32_t a, float_status *status)
1544 {
1545     return int64_to_float16(a, status);
1546 }
1547
1548 float16 int16_to_float16(int16_t a, float_status *status)
1549 {
1550     return int64_to_float16(a, status);
1551 }
1552
1553 float32 int64_to_float32(int64_t a, float_status *status)
1554 {
1555     FloatParts pa = int_to_float(a, status);
1556     return float32_round_pack_canonical(pa, status);
1557 }
1558
1559 float32 int32_to_float32(int32_t a, float_status *status)
1560 {
1561     return int64_to_float32(a, status);
1562 }
1563
1564 float32 int16_to_float32(int16_t a, float_status *status)
1565 {
1566     return int64_to_float32(a, status);
1567 }
1568
1569 float64 int64_to_float64(int64_t a, float_status *status)
1570 {
1571     FloatParts pa = int_to_float(a, status);
1572     return float64_round_pack_canonical(pa, status);
1573 }
1574
1575 float64 int32_to_float64(int32_t a, float_status *status)
1576 {
1577     return int64_to_float64(a, status);
1578 }
1579
1580 float64 int16_to_float64(int16_t a, float_status *status)
1581 {
1582     return int64_to_float64(a, status);
1583 }
1584
1585
1586 /*
1587  * Unsigned Integer to float conversions
1588  *
1589  * Returns the result of converting the unsigned integer `a' to the
1590  * floating-point format. The conversion is performed according to the
1591  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1592  */
1593
1594 static FloatParts uint_to_float(uint64_t a, float_status *status)
1595 {
1596     FloatParts r = { .sign = false};
1597
1598     if (a == 0) {
1599         r.cls = float_class_zero;
1600     } else {
1601         int spare_bits = clz64(a) - 1;
1602         r.cls = float_class_normal;
1603         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1604         if (spare_bits < 0) {
1605             shift64RightJamming(a, -spare_bits, &a);
1606             r.frac = a;
1607         } else {
1608             r.frac = a << spare_bits;
1609         }
1610     }
1611
1612     return r;
1613 }
1614
1615 float16 uint64_to_float16(uint64_t a, float_status *status)
1616 {
1617     FloatParts pa = uint_to_float(a, status);
1618     return float16_round_pack_canonical(pa, status);
1619 }
1620
1621 float16 uint32_to_float16(uint32_t a, float_status *status)
1622 {
1623     return uint64_to_float16(a, status);
1624 }
1625
1626 float16 uint16_to_float16(uint16_t a, float_status *status)
1627 {
1628     return uint64_to_float16(a, status);
1629 }
1630
1631 float32 uint64_to_float32(uint64_t a, float_status *status)
1632 {
1633     FloatParts pa = uint_to_float(a, status);
1634     return float32_round_pack_canonical(pa, status);
1635 }
1636
1637 float32 uint32_to_float32(uint32_t a, float_status *status)
1638 {
1639     return uint64_to_float32(a, status);
1640 }
1641
1642 float32 uint16_to_float32(uint16_t a, float_status *status)
1643 {
1644     return uint64_to_float32(a, status);
1645 }
1646
1647 float64 uint64_to_float64(uint64_t a, float_status *status)
1648 {
1649     FloatParts pa = uint_to_float(a, status);
1650     return float64_round_pack_canonical(pa, status);
1651 }
1652
1653 float64 uint32_to_float64(uint32_t a, float_status *status)
1654 {
1655     return uint64_to_float64(a, status);
1656 }
1657
1658 float64 uint16_to_float64(uint16_t a, float_status *status)
1659 {
1660     return uint64_to_float64(a, status);
1661 }
1662
1663 /* Float Min/Max */
1664 /* min() and max() functions. These can't be implemented as
1665  * 'compare and pick one input' because that would mishandle
1666  * NaNs and +0 vs -0.
1667  *
1668  * minnum() and maxnum() functions. These are similar to the min()
1669  * and max() functions but if one of the arguments is a QNaN and
1670  * the other is numerical then the numerical argument is returned.
1671  * SNaNs will get quietened before being returned.
1672  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1673  * and maxNum() operations. min() and max() are the typical min/max
1674  * semantics provided by many CPUs which predate that specification.
1675  *
1676  * minnummag() and maxnummag() functions correspond to minNumMag()
1677  * and minNumMag() from the IEEE-754 2008.
1678  */
1679 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1680                                 bool ieee, bool ismag, float_status *s)
1681 {
1682     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1683         if (ieee) {
1684             /* Takes two floating-point values `a' and `b', one of
1685              * which is a NaN, and returns the appropriate NaN
1686              * result. If either `a' or `b' is a signaling NaN,
1687              * the invalid exception is raised.
1688              */
1689             if (is_snan(a.cls) || is_snan(b.cls)) {
1690                 return pick_nan(a, b, s);
1691             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1692                 return b;
1693             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1694                 return a;
1695             }
1696         }
1697         return pick_nan(a, b, s);
1698     } else {
1699         int a_exp, b_exp;
1700
1701         switch (a.cls) {
1702         case float_class_normal:
1703             a_exp = a.exp;
1704             break;
1705         case float_class_inf:
1706             a_exp = INT_MAX;
1707             break;
1708         case float_class_zero:
1709             a_exp = INT_MIN;
1710             break;
1711         default:
1712             g_assert_not_reached();
1713             break;
1714         }
1715         switch (b.cls) {
1716         case float_class_normal:
1717             b_exp = b.exp;
1718             break;
1719         case float_class_inf:
1720             b_exp = INT_MAX;
1721             break;
1722         case float_class_zero:
1723             b_exp = INT_MIN;
1724             break;
1725         default:
1726             g_assert_not_reached();
1727             break;
1728         }
1729
1730         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1731             bool a_less = a_exp < b_exp;
1732             if (a_exp == b_exp) {
1733                 a_less = a.frac < b.frac;
1734             }
1735             return a_less ^ ismin ? b : a;
1736         }
1737
1738         if (a.sign == b.sign) {
1739             bool a_less = a_exp < b_exp;
1740             if (a_exp == b_exp) {
1741                 a_less = a.frac < b.frac;
1742             }
1743             return a.sign ^ a_less ^ ismin ? b : a;
1744         } else {
1745             return a.sign ^ ismin ? b : a;
1746         }
1747     }
1748 }
1749
1750 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1751 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1752                                      float_status *s)                   \
1753 {                                                                       \
1754     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1755     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1756     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1757                                                                         \
1758     return float ## sz ## _round_pack_canonical(pr, s);                 \
1759 }
1760
1761 MINMAX(16, min, true, false, false)
1762 MINMAX(16, minnum, true, true, false)
1763 MINMAX(16, minnummag, true, true, true)
1764 MINMAX(16, max, false, false, false)
1765 MINMAX(16, maxnum, false, true, false)
1766 MINMAX(16, maxnummag, false, true, true)
1767
1768 MINMAX(32, min, true, false, false)
1769 MINMAX(32, minnum, true, true, false)
1770 MINMAX(32, minnummag, true, true, true)
1771 MINMAX(32, max, false, false, false)
1772 MINMAX(32, maxnum, false, true, false)
1773 MINMAX(32, maxnummag, false, true, true)
1774
1775 MINMAX(64, min, true, false, false)
1776 MINMAX(64, minnum, true, true, false)
1777 MINMAX(64, minnummag, true, true, true)
1778 MINMAX(64, max, false, false, false)
1779 MINMAX(64, maxnum, false, true, false)
1780 MINMAX(64, maxnummag, false, true, true)
1781
1782 #undef MINMAX
1783
1784 /* Floating point compare */
1785 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1786                           float_status *s)
1787 {
1788     if (is_nan(a.cls) || is_nan(b.cls)) {
1789         if (!is_quiet ||
1790             a.cls == float_class_snan ||
1791             b.cls == float_class_snan) {
1792             s->float_exception_flags |= float_flag_invalid;
1793         }
1794         return float_relation_unordered;
1795     }
1796
1797     if (a.cls == float_class_zero) {
1798         if (b.cls == float_class_zero) {
1799             return float_relation_equal;
1800         }
1801         return b.sign ? float_relation_greater : float_relation_less;
1802     } else if (b.cls == float_class_zero) {
1803         return a.sign ? float_relation_less : float_relation_greater;
1804     }
1805
1806     /* The only really important thing about infinity is its sign. If
1807      * both are infinities the sign marks the smallest of the two.
1808      */
1809     if (a.cls == float_class_inf) {
1810         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1811             return float_relation_equal;
1812         }
1813         return a.sign ? float_relation_less : float_relation_greater;
1814     } else if (b.cls == float_class_inf) {
1815         return b.sign ? float_relation_greater : float_relation_less;
1816     }
1817
1818     if (a.sign != b.sign) {
1819         return a.sign ? float_relation_less : float_relation_greater;
1820     }
1821
1822     if (a.exp == b.exp) {
1823         if (a.frac == b.frac) {
1824             return float_relation_equal;
1825         }
1826         if (a.sign) {
1827             return a.frac > b.frac ?
1828                 float_relation_less : float_relation_greater;
1829         } else {
1830             return a.frac > b.frac ?
1831                 float_relation_greater : float_relation_less;
1832         }
1833     } else {
1834         if (a.sign) {
1835             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1836         } else {
1837             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1838         }
1839     }
1840 }
1841
1842 #define COMPARE(sz)                                                     \
1843 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1844                             float_status *s)                            \
1845 {                                                                       \
1846     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1847     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1848     return compare_floats(pa, pb, false, s);                            \
1849 }                                                                       \
1850 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1851                                   float_status *s)                      \
1852 {                                                                       \
1853     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1854     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1855     return compare_floats(pa, pb, true, s);                             \
1856 }
1857
1858 COMPARE(16)
1859 COMPARE(32)
1860 COMPARE(64)
1861
1862 #undef COMPARE
1863
1864 /* Multiply A by 2 raised to the power N.  */
1865 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1866 {
1867     if (unlikely(is_nan(a.cls))) {
1868         return return_nan(a, s);
1869     }
1870     if (a.cls == float_class_normal) {
1871         /* The largest float type (even though not supported by FloatParts)
1872          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1873          * still allows rounding to infinity, without allowing overflow
1874          * within the int32_t that backs FloatParts.exp.
1875          */
1876         n = MIN(MAX(n, -0x10000), 0x10000);
1877         a.exp += n;
1878     }
1879     return a;
1880 }
1881
1882 float16 float16_scalbn(float16 a, int n, float_status *status)
1883 {
1884     FloatParts pa = float16_unpack_canonical(a, status);
1885     FloatParts pr = scalbn_decomposed(pa, n, status);
1886     return float16_round_pack_canonical(pr, status);
1887 }
1888
1889 float32 float32_scalbn(float32 a, int n, float_status *status)
1890 {
1891     FloatParts pa = float32_unpack_canonical(a, status);
1892     FloatParts pr = scalbn_decomposed(pa, n, status);
1893     return float32_round_pack_canonical(pr, status);
1894 }
1895
1896 float64 float64_scalbn(float64 a, int n, float_status *status)
1897 {
1898     FloatParts pa = float64_unpack_canonical(a, status);
1899     FloatParts pr = scalbn_decomposed(pa, n, status);
1900     return float64_round_pack_canonical(pr, status);
1901 }
1902
1903 /*
1904  * Square Root
1905  *
1906  * The old softfloat code did an approximation step before zeroing in
1907  * on the final result. However for simpleness we just compute the
1908  * square root by iterating down from the implicit bit to enough extra
1909  * bits to ensure we get a correctly rounded result.
1910  *
1911  * This does mean however the calculation is slower than before,
1912  * especially for 64 bit floats.
1913  */
1914
1915 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1916 {
1917     uint64_t a_frac, r_frac, s_frac;
1918     int bit, last_bit;
1919
1920     if (is_nan(a.cls)) {
1921         return return_nan(a, s);
1922     }
1923     if (a.cls == float_class_zero) {
1924         return a;  /* sqrt(+-0) = +-0 */
1925     }
1926     if (a.sign) {
1927         s->float_exception_flags |= float_flag_invalid;
1928         return parts_default_nan(s);
1929     }
1930     if (a.cls == float_class_inf) {
1931         return a;  /* sqrt(+inf) = +inf */
1932     }
1933
1934     assert(a.cls == float_class_normal);
1935
1936     /* We need two overflow bits at the top. Adding room for that is a
1937      * right shift. If the exponent is odd, we can discard the low bit
1938      * by multiplying the fraction by 2; that's a left shift. Combine
1939      * those and we shift right if the exponent is even.
1940      */
1941     a_frac = a.frac;
1942     if (!(a.exp & 1)) {
1943         a_frac >>= 1;
1944     }
1945     a.exp >>= 1;
1946
1947     /* Bit-by-bit computation of sqrt.  */
1948     r_frac = 0;
1949     s_frac = 0;
1950
1951     /* Iterate from implicit bit down to the 3 extra bits to compute a
1952      * properly rounded result. Remember we've inserted one more bit
1953      * at the top, so these positions are one less.
1954      */
1955     bit = DECOMPOSED_BINARY_POINT - 1;
1956     last_bit = MAX(p->frac_shift - 4, 0);
1957     do {
1958         uint64_t q = 1ULL << bit;
1959         uint64_t t_frac = s_frac + q;
1960         if (t_frac <= a_frac) {
1961             s_frac = t_frac + q;
1962             a_frac -= t_frac;
1963             r_frac += q;
1964         }
1965         a_frac <<= 1;
1966     } while (--bit >= last_bit);
1967
1968     /* Undo the right shift done above. If there is any remaining
1969      * fraction, the result is inexact. Set the sticky bit.
1970      */
1971     a.frac = (r_frac << 1) + (a_frac != 0);
1972
1973     return a;
1974 }
1975
1976 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1977 {
1978     FloatParts pa = float16_unpack_canonical(a, status);
1979     FloatParts pr = sqrt_float(pa, status, &float16_params);
1980     return float16_round_pack_canonical(pr, status);
1981 }
1982
1983 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1984 {
1985     FloatParts pa = float32_unpack_canonical(a, status);
1986     FloatParts pr = sqrt_float(pa, status, &float32_params);
1987     return float32_round_pack_canonical(pr, status);
1988 }
1989
1990 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1991 {
1992     FloatParts pa = float64_unpack_canonical(a, status);
1993     FloatParts pr = sqrt_float(pa, status, &float64_params);
1994     return float64_round_pack_canonical(pr, status);
1995 }
1996
1997
1998 /*----------------------------------------------------------------------------
1999 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2000 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2001 | input.  If `zSign' is 1, the input is negated before being converted to an
2002 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2003 | is simply rounded to an integer, with the inexact exception raised if the
2004 | input cannot be represented exactly as an integer.  However, if the fixed-
2005 | point input is too large, the invalid exception is raised and the largest
2006 | positive or negative integer is returned.
2007 *----------------------------------------------------------------------------*/
2008
2009 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2010 {
2011     int8_t roundingMode;
2012     flag roundNearestEven;
2013     int8_t roundIncrement, roundBits;
2014     int32_t z;
2015
2016     roundingMode = status->float_rounding_mode;
2017     roundNearestEven = ( roundingMode == float_round_nearest_even );
2018     switch (roundingMode) {
2019     case float_round_nearest_even:
2020     case float_round_ties_away:
2021         roundIncrement = 0x40;
2022         break;
2023     case float_round_to_zero:
2024         roundIncrement = 0;
2025         break;
2026     case float_round_up:
2027         roundIncrement = zSign ? 0 : 0x7f;
2028         break;
2029     case float_round_down:
2030         roundIncrement = zSign ? 0x7f : 0;
2031         break;
2032     default:
2033         abort();
2034     }
2035     roundBits = absZ & 0x7F;
2036     absZ = ( absZ + roundIncrement )>>7;
2037     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2038     z = absZ;
2039     if ( zSign ) z = - z;
2040     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2041         float_raise(float_flag_invalid, status);
2042         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2043     }
2044     if (roundBits) {
2045         status->float_exception_flags |= float_flag_inexact;
2046     }
2047     return z;
2048
2049 }
2050
2051 /*----------------------------------------------------------------------------
2052 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2053 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2054 | and returns the properly rounded 64-bit integer corresponding to the input.
2055 | If `zSign' is 1, the input is negated before being converted to an integer.
2056 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2057 | the inexact exception raised if the input cannot be represented exactly as
2058 | an integer.  However, if the fixed-point input is too large, the invalid
2059 | exception is raised and the largest positive or negative integer is
2060 | returned.
2061 *----------------------------------------------------------------------------*/
2062
2063 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2064                                float_status *status)
2065 {
2066     int8_t roundingMode;
2067     flag roundNearestEven, increment;
2068     int64_t z;
2069
2070     roundingMode = status->float_rounding_mode;
2071     roundNearestEven = ( roundingMode == float_round_nearest_even );
2072     switch (roundingMode) {
2073     case float_round_nearest_even:
2074     case float_round_ties_away:
2075         increment = ((int64_t) absZ1 < 0);
2076         break;
2077     case float_round_to_zero:
2078         increment = 0;
2079         break;
2080     case float_round_up:
2081         increment = !zSign && absZ1;
2082         break;
2083     case float_round_down:
2084         increment = zSign && absZ1;
2085         break;
2086     default:
2087         abort();
2088     }
2089     if ( increment ) {
2090         ++absZ0;
2091         if ( absZ0 == 0 ) goto overflow;
2092         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2093     }
2094     z = absZ0;
2095     if ( zSign ) z = - z;
2096     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2097  overflow:
2098         float_raise(float_flag_invalid, status);
2099         return
2100               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2101             : LIT64( 0x7FFFFFFFFFFFFFFF );
2102     }
2103     if (absZ1) {
2104         status->float_exception_flags |= float_flag_inexact;
2105     }
2106     return z;
2107
2108 }
2109
2110 /*----------------------------------------------------------------------------
2111 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2112 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2113 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2114 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2115 | with the inexact exception raised if the input cannot be represented exactly
2116 | as an integer.  However, if the fixed-point input is too large, the invalid
2117 | exception is raised and the largest unsigned integer is returned.
2118 *----------------------------------------------------------------------------*/
2119
2120 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2121                                 uint64_t absZ1, float_status *status)
2122 {
2123     int8_t roundingMode;
2124     flag roundNearestEven, increment;
2125
2126     roundingMode = status->float_rounding_mode;
2127     roundNearestEven = (roundingMode == float_round_nearest_even);
2128     switch (roundingMode) {
2129     case float_round_nearest_even:
2130     case float_round_ties_away:
2131         increment = ((int64_t)absZ1 < 0);
2132         break;
2133     case float_round_to_zero:
2134         increment = 0;
2135         break;
2136     case float_round_up:
2137         increment = !zSign && absZ1;
2138         break;
2139     case float_round_down:
2140         increment = zSign && absZ1;
2141         break;
2142     default:
2143         abort();
2144     }
2145     if (increment) {
2146         ++absZ0;
2147         if (absZ0 == 0) {
2148             float_raise(float_flag_invalid, status);
2149             return LIT64(0xFFFFFFFFFFFFFFFF);
2150         }
2151         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2152     }
2153
2154     if (zSign && absZ0) {
2155         float_raise(float_flag_invalid, status);
2156         return 0;
2157     }
2158
2159     if (absZ1) {
2160         status->float_exception_flags |= float_flag_inexact;
2161     }
2162     return absZ0;
2163 }
2164
2165 /*----------------------------------------------------------------------------
2166 | If `a' is denormal and we are in flush-to-zero mode then set the
2167 | input-denormal exception and return zero. Otherwise just return the value.
2168 *----------------------------------------------------------------------------*/
2169 float32 float32_squash_input_denormal(float32 a, float_status *status)
2170 {
2171     if (status->flush_inputs_to_zero) {
2172         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2173             float_raise(float_flag_input_denormal, status);
2174             return make_float32(float32_val(a) & 0x80000000);
2175         }
2176     }
2177     return a;
2178 }
2179
2180 /*----------------------------------------------------------------------------
2181 | Normalizes the subnormal single-precision floating-point value represented
2182 | by the denormalized significand `aSig'.  The normalized exponent and
2183 | significand are stored at the locations pointed to by `zExpPtr' and
2184 | `zSigPtr', respectively.
2185 *----------------------------------------------------------------------------*/
2186
2187 static void
2188  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2189 {
2190     int8_t shiftCount;
2191
2192     shiftCount = countLeadingZeros32( aSig ) - 8;
2193     *zSigPtr = aSig<<shiftCount;
2194     *zExpPtr = 1 - shiftCount;
2195
2196 }
2197
2198 /*----------------------------------------------------------------------------
2199 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2200 | and significand `zSig', and returns the proper single-precision floating-
2201 | point value corresponding to the abstract input.  Ordinarily, the abstract
2202 | value is simply rounded and packed into the single-precision format, with
2203 | the inexact exception raised if the abstract input cannot be represented
2204 | exactly.  However, if the abstract value is too large, the overflow and
2205 | inexact exceptions are raised and an infinity or maximal finite value is
2206 | returned.  If the abstract value is too small, the input value is rounded to
2207 | a subnormal number, and the underflow and inexact exceptions are raised if
2208 | the abstract input cannot be represented exactly as a subnormal single-
2209 | precision floating-point number.
2210 |     The input significand `zSig' has its binary point between bits 30
2211 | and 29, which is 7 bits to the left of the usual location.  This shifted
2212 | significand must be normalized or smaller.  If `zSig' is not normalized,
2213 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2214 | and it must not require rounding.  In the usual case that `zSig' is
2215 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2216 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2217 | Binary Floating-Point Arithmetic.
2218 *----------------------------------------------------------------------------*/
2219
2220 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2221                                    float_status *status)
2222 {
2223     int8_t roundingMode;
2224     flag roundNearestEven;
2225     int8_t roundIncrement, roundBits;
2226     flag isTiny;
2227
2228     roundingMode = status->float_rounding_mode;
2229     roundNearestEven = ( roundingMode == float_round_nearest_even );
2230     switch (roundingMode) {
2231     case float_round_nearest_even:
2232     case float_round_ties_away:
2233         roundIncrement = 0x40;
2234         break;
2235     case float_round_to_zero:
2236         roundIncrement = 0;
2237         break;
2238     case float_round_up:
2239         roundIncrement = zSign ? 0 : 0x7f;
2240         break;
2241     case float_round_down:
2242         roundIncrement = zSign ? 0x7f : 0;
2243         break;
2244     default:
2245         abort();
2246         break;
2247     }
2248     roundBits = zSig & 0x7F;
2249     if ( 0xFD <= (uint16_t) zExp ) {
2250         if (    ( 0xFD < zExp )
2251              || (    ( zExp == 0xFD )
2252                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2253            ) {
2254             float_raise(float_flag_overflow | float_flag_inexact, status);
2255             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2256         }
2257         if ( zExp < 0 ) {
2258             if (status->flush_to_zero) {
2259                 float_raise(float_flag_output_denormal, status);
2260                 return packFloat32(zSign, 0, 0);
2261             }
2262             isTiny =
2263                 (status->float_detect_tininess
2264                  == float_tininess_before_rounding)
2265                 || ( zExp < -1 )
2266                 || ( zSig + roundIncrement < 0x80000000 );
2267             shift32RightJamming( zSig, - zExp, &zSig );
2268             zExp = 0;
2269             roundBits = zSig & 0x7F;
2270             if (isTiny && roundBits) {
2271                 float_raise(float_flag_underflow, status);
2272             }
2273         }
2274     }
2275     if (roundBits) {
2276         status->float_exception_flags |= float_flag_inexact;
2277     }
2278     zSig = ( zSig + roundIncrement )>>7;
2279     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2280     if ( zSig == 0 ) zExp = 0;
2281     return packFloat32( zSign, zExp, zSig );
2282
2283 }
2284
2285 /*----------------------------------------------------------------------------
2286 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2287 | and significand `zSig', and returns the proper single-precision floating-
2288 | point value corresponding to the abstract input.  This routine is just like
2289 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2290 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2291 | floating-point exponent.
2292 *----------------------------------------------------------------------------*/
2293
2294 static float32
2295  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2296                               float_status *status)
2297 {
2298     int8_t shiftCount;
2299
2300     shiftCount = countLeadingZeros32( zSig ) - 1;
2301     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2302                                status);
2303
2304 }
2305
2306 /*----------------------------------------------------------------------------
2307 | If `a' is denormal and we are in flush-to-zero mode then set the
2308 | input-denormal exception and return zero. Otherwise just return the value.
2309 *----------------------------------------------------------------------------*/
2310 float64 float64_squash_input_denormal(float64 a, float_status *status)
2311 {
2312     if (status->flush_inputs_to_zero) {
2313         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2314             float_raise(float_flag_input_denormal, status);
2315             return make_float64(float64_val(a) & (1ULL << 63));
2316         }
2317     }
2318     return a;
2319 }
2320
2321 /*----------------------------------------------------------------------------
2322 | Normalizes the subnormal double-precision floating-point value represented
2323 | by the denormalized significand `aSig'.  The normalized exponent and
2324 | significand are stored at the locations pointed to by `zExpPtr' and
2325 | `zSigPtr', respectively.
2326 *----------------------------------------------------------------------------*/
2327
2328 static void
2329  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2330 {
2331     int8_t shiftCount;
2332
2333     shiftCount = countLeadingZeros64( aSig ) - 11;
2334     *zSigPtr = aSig<<shiftCount;
2335     *zExpPtr = 1 - shiftCount;
2336
2337 }
2338
2339 /*----------------------------------------------------------------------------
2340 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2341 | double-precision floating-point value, returning the result.  After being
2342 | shifted into the proper positions, the three fields are simply added
2343 | together to form the result.  This means that any integer portion of `zSig'
2344 | will be added into the exponent.  Since a properly normalized significand
2345 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2346 | than the desired result exponent whenever `zSig' is a complete, normalized
2347 | significand.
2348 *----------------------------------------------------------------------------*/
2349
2350 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2351 {
2352
2353     return make_float64(
2354         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2355
2356 }
2357
2358 /*----------------------------------------------------------------------------
2359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2360 | and significand `zSig', and returns the proper double-precision floating-
2361 | point value corresponding to the abstract input.  Ordinarily, the abstract
2362 | value is simply rounded and packed into the double-precision format, with
2363 | the inexact exception raised if the abstract input cannot be represented
2364 | exactly.  However, if the abstract value is too large, the overflow and
2365 | inexact exceptions are raised and an infinity or maximal finite value is
2366 | returned.  If the abstract value is too small, the input value is rounded to
2367 | a subnormal number, and the underflow and inexact exceptions are raised if
2368 | the abstract input cannot be represented exactly as a subnormal double-
2369 | precision floating-point number.
2370 |     The input significand `zSig' has its binary point between bits 62
2371 | and 61, which is 10 bits to the left of the usual location.  This shifted
2372 | significand must be normalized or smaller.  If `zSig' is not normalized,
2373 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2374 | and it must not require rounding.  In the usual case that `zSig' is
2375 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2376 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2377 | Binary Floating-Point Arithmetic.
2378 *----------------------------------------------------------------------------*/
2379
2380 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2381                                    float_status *status)
2382 {
2383     int8_t roundingMode;
2384     flag roundNearestEven;
2385     int roundIncrement, roundBits;
2386     flag isTiny;
2387
2388     roundingMode = status->float_rounding_mode;
2389     roundNearestEven = ( roundingMode == float_round_nearest_even );
2390     switch (roundingMode) {
2391     case float_round_nearest_even:
2392     case float_round_ties_away:
2393         roundIncrement = 0x200;
2394         break;
2395     case float_round_to_zero:
2396         roundIncrement = 0;
2397         break;
2398     case float_round_up:
2399         roundIncrement = zSign ? 0 : 0x3ff;
2400         break;
2401     case float_round_down:
2402         roundIncrement = zSign ? 0x3ff : 0;
2403         break;
2404     case float_round_to_odd:
2405         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2406         break;
2407     default:
2408         abort();
2409     }
2410     roundBits = zSig & 0x3FF;
2411     if ( 0x7FD <= (uint16_t) zExp ) {
2412         if (    ( 0x7FD < zExp )
2413              || (    ( zExp == 0x7FD )
2414                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2415            ) {
2416             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2417                                    roundIncrement != 0;
2418             float_raise(float_flag_overflow | float_flag_inexact, status);
2419             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2420         }
2421         if ( zExp < 0 ) {
2422             if (status->flush_to_zero) {
2423                 float_raise(float_flag_output_denormal, status);
2424                 return packFloat64(zSign, 0, 0);
2425             }
2426             isTiny =
2427                    (status->float_detect_tininess
2428                     == float_tininess_before_rounding)
2429                 || ( zExp < -1 )
2430                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2431             shift64RightJamming( zSig, - zExp, &zSig );
2432             zExp = 0;
2433             roundBits = zSig & 0x3FF;
2434             if (isTiny && roundBits) {
2435                 float_raise(float_flag_underflow, status);
2436             }
2437             if (roundingMode == float_round_to_odd) {
2438                 /*
2439                  * For round-to-odd case, the roundIncrement depends on
2440                  * zSig which just changed.
2441                  */
2442                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2443             }
2444         }
2445     }
2446     if (roundBits) {
2447         status->float_exception_flags |= float_flag_inexact;
2448     }
2449     zSig = ( zSig + roundIncrement )>>10;
2450     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2451     if ( zSig == 0 ) zExp = 0;
2452     return packFloat64( zSign, zExp, zSig );
2453
2454 }
2455
2456 /*----------------------------------------------------------------------------
2457 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2458 | and significand `zSig', and returns the proper double-precision floating-
2459 | point value corresponding to the abstract input.  This routine is just like
2460 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2461 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2462 | floating-point exponent.
2463 *----------------------------------------------------------------------------*/
2464
2465 static float64
2466  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2467                               float_status *status)
2468 {
2469     int8_t shiftCount;
2470
2471     shiftCount = countLeadingZeros64( zSig ) - 1;
2472     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2473                                status);
2474
2475 }
2476
2477 /*----------------------------------------------------------------------------
2478 | Normalizes the subnormal extended double-precision floating-point value
2479 | represented by the denormalized significand `aSig'.  The normalized exponent
2480 | and significand are stored at the locations pointed to by `zExpPtr' and
2481 | `zSigPtr', respectively.
2482 *----------------------------------------------------------------------------*/
2483
2484 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2485                                 uint64_t *zSigPtr)
2486 {
2487     int8_t shiftCount;
2488
2489     shiftCount = countLeadingZeros64( aSig );
2490     *zSigPtr = aSig<<shiftCount;
2491     *zExpPtr = 1 - shiftCount;
2492 }
2493
2494 /*----------------------------------------------------------------------------
2495 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2496 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2497 | and returns the proper extended double-precision floating-point value
2498 | corresponding to the abstract input.  Ordinarily, the abstract value is
2499 | rounded and packed into the extended double-precision format, with the
2500 | inexact exception raised if the abstract input cannot be represented
2501 | exactly.  However, if the abstract value is too large, the overflow and
2502 | inexact exceptions are raised and an infinity or maximal finite value is
2503 | returned.  If the abstract value is too small, the input value is rounded to
2504 | a subnormal number, and the underflow and inexact exceptions are raised if
2505 | the abstract input cannot be represented exactly as a subnormal extended
2506 | double-precision floating-point number.
2507 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2508 | number of bits as single or double precision, respectively.  Otherwise, the
2509 | result is rounded to the full precision of the extended double-precision
2510 | format.
2511 |     The input significand must be normalized or smaller.  If the input
2512 | significand is not normalized, `zExp' must be 0; in that case, the result
2513 | returned is a subnormal number, and it must not require rounding.  The
2514 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2515 | Floating-Point Arithmetic.
2516 *----------------------------------------------------------------------------*/
2517
2518 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2519                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2520                               float_status *status)
2521 {
2522     int8_t roundingMode;
2523     flag roundNearestEven, increment, isTiny;
2524     int64_t roundIncrement, roundMask, roundBits;
2525
2526     roundingMode = status->float_rounding_mode;
2527     roundNearestEven = ( roundingMode == float_round_nearest_even );
2528     if ( roundingPrecision == 80 ) goto precision80;
2529     if ( roundingPrecision == 64 ) {
2530         roundIncrement = LIT64( 0x0000000000000400 );
2531         roundMask = LIT64( 0x00000000000007FF );
2532     }
2533     else if ( roundingPrecision == 32 ) {
2534         roundIncrement = LIT64( 0x0000008000000000 );
2535         roundMask = LIT64( 0x000000FFFFFFFFFF );
2536     }
2537     else {
2538         goto precision80;
2539     }
2540     zSig0 |= ( zSig1 != 0 );
2541     switch (roundingMode) {
2542     case float_round_nearest_even:
2543     case float_round_ties_away:
2544         break;
2545     case float_round_to_zero:
2546         roundIncrement = 0;
2547         break;
2548     case float_round_up:
2549         roundIncrement = zSign ? 0 : roundMask;
2550         break;
2551     case float_round_down:
2552         roundIncrement = zSign ? roundMask : 0;
2553         break;
2554     default:
2555         abort();
2556     }
2557     roundBits = zSig0 & roundMask;
2558     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2559         if (    ( 0x7FFE < zExp )
2560              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2561            ) {
2562             goto overflow;
2563         }
2564         if ( zExp <= 0 ) {
2565             if (status->flush_to_zero) {
2566                 float_raise(float_flag_output_denormal, status);
2567                 return packFloatx80(zSign, 0, 0);
2568             }
2569             isTiny =
2570                    (status->float_detect_tininess
2571                     == float_tininess_before_rounding)
2572                 || ( zExp < 0 )
2573                 || ( zSig0 <= zSig0 + roundIncrement );
2574             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2575             zExp = 0;
2576             roundBits = zSig0 & roundMask;
2577             if (isTiny && roundBits) {
2578                 float_raise(float_flag_underflow, status);
2579             }
2580             if (roundBits) {
2581                 status->float_exception_flags |= float_flag_inexact;
2582             }
2583             zSig0 += roundIncrement;
2584             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2585             roundIncrement = roundMask + 1;
2586             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2587                 roundMask |= roundIncrement;
2588             }
2589             zSig0 &= ~ roundMask;
2590             return packFloatx80( zSign, zExp, zSig0 );
2591         }
2592     }
2593     if (roundBits) {
2594         status->float_exception_flags |= float_flag_inexact;
2595     }
2596     zSig0 += roundIncrement;
2597     if ( zSig0 < roundIncrement ) {
2598         ++zExp;
2599         zSig0 = LIT64( 0x8000000000000000 );
2600     }
2601     roundIncrement = roundMask + 1;
2602     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2603         roundMask |= roundIncrement;
2604     }
2605     zSig0 &= ~ roundMask;
2606     if ( zSig0 == 0 ) zExp = 0;
2607     return packFloatx80( zSign, zExp, zSig0 );
2608  precision80:
2609     switch (roundingMode) {
2610     case float_round_nearest_even:
2611     case float_round_ties_away:
2612         increment = ((int64_t)zSig1 < 0);
2613         break;
2614     case float_round_to_zero:
2615         increment = 0;
2616         break;
2617     case float_round_up:
2618         increment = !zSign && zSig1;
2619         break;
2620     case float_round_down:
2621         increment = zSign && zSig1;
2622         break;
2623     default:
2624         abort();
2625     }
2626     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2627         if (    ( 0x7FFE < zExp )
2628              || (    ( zExp == 0x7FFE )
2629                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2630                   && increment
2631                 )
2632            ) {
2633             roundMask = 0;
2634  overflow:
2635             float_raise(float_flag_overflow | float_flag_inexact, status);
2636             if (    ( roundingMode == float_round_to_zero )
2637                  || ( zSign && ( roundingMode == float_round_up ) )
2638                  || ( ! zSign && ( roundingMode == float_round_down ) )
2639                ) {
2640                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2641             }
2642             return packFloatx80(zSign,
2643                                 floatx80_infinity_high,
2644                                 floatx80_infinity_low);
2645         }
2646         if ( zExp <= 0 ) {
2647             isTiny =
2648                    (status->float_detect_tininess
2649                     == float_tininess_before_rounding)
2650                 || ( zExp < 0 )
2651                 || ! increment
2652                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2653             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2654             zExp = 0;
2655             if (isTiny && zSig1) {
2656                 float_raise(float_flag_underflow, status);
2657             }
2658             if (zSig1) {
2659                 status->float_exception_flags |= float_flag_inexact;
2660             }
2661             switch (roundingMode) {
2662             case float_round_nearest_even:
2663             case float_round_ties_away:
2664                 increment = ((int64_t)zSig1 < 0);
2665                 break;
2666             case float_round_to_zero:
2667                 increment = 0;
2668                 break;
2669             case float_round_up:
2670                 increment = !zSign && zSig1;
2671                 break;
2672             case float_round_down:
2673                 increment = zSign && zSig1;
2674                 break;
2675             default:
2676                 abort();
2677             }
2678             if ( increment ) {
2679                 ++zSig0;
2680                 zSig0 &=
2681                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2682                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2683             }
2684             return packFloatx80( zSign, zExp, zSig0 );
2685         }
2686     }
2687     if (zSig1) {
2688         status->float_exception_flags |= float_flag_inexact;
2689     }
2690     if ( increment ) {
2691         ++zSig0;
2692         if ( zSig0 == 0 ) {
2693             ++zExp;
2694             zSig0 = LIT64( 0x8000000000000000 );
2695         }
2696         else {
2697             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2698         }
2699     }
2700     else {
2701         if ( zSig0 == 0 ) zExp = 0;
2702     }
2703     return packFloatx80( zSign, zExp, zSig0 );
2704
2705 }
2706
2707 /*----------------------------------------------------------------------------
2708 | Takes an abstract floating-point value having sign `zSign', exponent
2709 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2710 | and returns the proper extended double-precision floating-point value
2711 | corresponding to the abstract input.  This routine is just like
2712 | `roundAndPackFloatx80' except that the input significand does not have to be
2713 | normalized.
2714 *----------------------------------------------------------------------------*/
2715
2716 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2717                                        flag zSign, int32_t zExp,
2718                                        uint64_t zSig0, uint64_t zSig1,
2719                                        float_status *status)
2720 {
2721     int8_t shiftCount;
2722
2723     if ( zSig0 == 0 ) {
2724         zSig0 = zSig1;
2725         zSig1 = 0;
2726         zExp -= 64;
2727     }
2728     shiftCount = countLeadingZeros64( zSig0 );
2729     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2730     zExp -= shiftCount;
2731     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2732                                 zSig0, zSig1, status);
2733
2734 }
2735
2736 /*----------------------------------------------------------------------------
2737 | Returns the least-significant 64 fraction bits of the quadruple-precision
2738 | floating-point value `a'.
2739 *----------------------------------------------------------------------------*/
2740
2741 static inline uint64_t extractFloat128Frac1( float128 a )
2742 {
2743
2744     return a.low;
2745
2746 }
2747
2748 /*----------------------------------------------------------------------------
2749 | Returns the most-significant 48 fraction bits of the quadruple-precision
2750 | floating-point value `a'.
2751 *----------------------------------------------------------------------------*/
2752
2753 static inline uint64_t extractFloat128Frac0( float128 a )
2754 {
2755
2756     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2757
2758 }
2759
2760 /*----------------------------------------------------------------------------
2761 | Returns the exponent bits of the quadruple-precision floating-point value
2762 | `a'.
2763 *----------------------------------------------------------------------------*/
2764
2765 static inline int32_t extractFloat128Exp( float128 a )
2766 {
2767
2768     return ( a.high>>48 ) & 0x7FFF;
2769
2770 }
2771
2772 /*----------------------------------------------------------------------------
2773 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2774 *----------------------------------------------------------------------------*/
2775
2776 static inline flag extractFloat128Sign( float128 a )
2777 {
2778
2779     return a.high>>63;
2780
2781 }
2782
2783 /*----------------------------------------------------------------------------
2784 | Normalizes the subnormal quadruple-precision floating-point value
2785 | represented by the denormalized significand formed by the concatenation of
2786 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2787 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2788 | significand are stored at the location pointed to by `zSig0Ptr', and the
2789 | least significant 64 bits of the normalized significand are stored at the
2790 | location pointed to by `zSig1Ptr'.
2791 *----------------------------------------------------------------------------*/
2792
2793 static void
2794  normalizeFloat128Subnormal(
2795      uint64_t aSig0,
2796      uint64_t aSig1,
2797      int32_t *zExpPtr,
2798      uint64_t *zSig0Ptr,
2799      uint64_t *zSig1Ptr
2800  )
2801 {
2802     int8_t shiftCount;
2803
2804     if ( aSig0 == 0 ) {
2805         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2806         if ( shiftCount < 0 ) {
2807             *zSig0Ptr = aSig1>>( - shiftCount );
2808             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2809         }
2810         else {
2811             *zSig0Ptr = aSig1<<shiftCount;
2812             *zSig1Ptr = 0;
2813         }
2814         *zExpPtr = - shiftCount - 63;
2815     }
2816     else {
2817         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2818         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2819         *zExpPtr = 1 - shiftCount;
2820     }
2821
2822 }
2823
2824 /*----------------------------------------------------------------------------
2825 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2826 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2827 | floating-point value, returning the result.  After being shifted into the
2828 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2829 | added together to form the most significant 32 bits of the result.  This
2830 | means that any integer portion of `zSig0' will be added into the exponent.
2831 | Since a properly normalized significand will have an integer portion equal
2832 | to 1, the `zExp' input should be 1 less than the desired result exponent
2833 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2834 | significand.
2835 *----------------------------------------------------------------------------*/
2836
2837 static inline float128
2838  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2839 {
2840     float128 z;
2841
2842     z.low = zSig1;
2843     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2844     return z;
2845
2846 }
2847
2848 /*----------------------------------------------------------------------------
2849 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2850 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2851 | and `zSig2', and returns the proper quadruple-precision floating-point value
2852 | corresponding to the abstract input.  Ordinarily, the abstract value is
2853 | simply rounded and packed into the quadruple-precision format, with the
2854 | inexact exception raised if the abstract input cannot be represented
2855 | exactly.  However, if the abstract value is too large, the overflow and
2856 | inexact exceptions are raised and an infinity or maximal finite value is
2857 | returned.  If the abstract value is too small, the input value is rounded to
2858 | a subnormal number, and the underflow and inexact exceptions are raised if
2859 | the abstract input cannot be represented exactly as a subnormal quadruple-
2860 | precision floating-point number.
2861 |     The input significand must be normalized or smaller.  If the input
2862 | significand is not normalized, `zExp' must be 0; in that case, the result
2863 | returned is a subnormal number, and it must not require rounding.  In the
2864 | usual case that the input significand is normalized, `zExp' must be 1 less
2865 | than the ``true'' floating-point exponent.  The handling of underflow and
2866 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 *----------------------------------------------------------------------------*/
2868
2869 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2870                                      uint64_t zSig0, uint64_t zSig1,
2871                                      uint64_t zSig2, float_status *status)
2872 {
2873     int8_t roundingMode;
2874     flag roundNearestEven, increment, isTiny;
2875
2876     roundingMode = status->float_rounding_mode;
2877     roundNearestEven = ( roundingMode == float_round_nearest_even );
2878     switch (roundingMode) {
2879     case float_round_nearest_even:
2880     case float_round_ties_away:
2881         increment = ((int64_t)zSig2 < 0);
2882         break;
2883     case float_round_to_zero:
2884         increment = 0;
2885         break;
2886     case float_round_up:
2887         increment = !zSign && zSig2;
2888         break;
2889     case float_round_down:
2890         increment = zSign && zSig2;
2891         break;
2892     case float_round_to_odd:
2893         increment = !(zSig1 & 0x1) && zSig2;
2894         break;
2895     default:
2896         abort();
2897     }
2898     if ( 0x7FFD <= (uint32_t) zExp ) {
2899         if (    ( 0x7FFD < zExp )
2900              || (    ( zExp == 0x7FFD )
2901                   && eq128(
2902                          LIT64( 0x0001FFFFFFFFFFFF ),
2903                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2904                          zSig0,
2905                          zSig1
2906                      )
2907                   && increment
2908                 )
2909            ) {
2910             float_raise(float_flag_overflow | float_flag_inexact, status);
2911             if (    ( roundingMode == float_round_to_zero )
2912                  || ( zSign && ( roundingMode == float_round_up ) )
2913                  || ( ! zSign && ( roundingMode == float_round_down ) )
2914                  || (roundingMode == float_round_to_odd)
2915                ) {
2916                 return
2917                     packFloat128(
2918                         zSign,
2919                         0x7FFE,
2920                         LIT64( 0x0000FFFFFFFFFFFF ),
2921                         LIT64( 0xFFFFFFFFFFFFFFFF )
2922                     );
2923             }
2924             return packFloat128( zSign, 0x7FFF, 0, 0 );
2925         }
2926         if ( zExp < 0 ) {
2927             if (status->flush_to_zero) {
2928                 float_raise(float_flag_output_denormal, status);
2929                 return packFloat128(zSign, 0, 0, 0);
2930             }
2931             isTiny =
2932                    (status->float_detect_tininess
2933                     == float_tininess_before_rounding)
2934                 || ( zExp < -1 )
2935                 || ! increment
2936                 || lt128(
2937                        zSig0,
2938                        zSig1,
2939                        LIT64( 0x0001FFFFFFFFFFFF ),
2940                        LIT64( 0xFFFFFFFFFFFFFFFF )
2941                    );
2942             shift128ExtraRightJamming(
2943                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2944             zExp = 0;
2945             if (isTiny && zSig2) {
2946                 float_raise(float_flag_underflow, status);
2947             }
2948             switch (roundingMode) {
2949             case float_round_nearest_even:
2950             case float_round_ties_away:
2951                 increment = ((int64_t)zSig2 < 0);
2952                 break;
2953             case float_round_to_zero:
2954                 increment = 0;
2955                 break;
2956             case float_round_up:
2957                 increment = !zSign && zSig2;
2958                 break;
2959             case float_round_down:
2960                 increment = zSign && zSig2;
2961                 break;
2962             case float_round_to_odd:
2963                 increment = !(zSig1 & 0x1) && zSig2;
2964                 break;
2965             default:
2966                 abort();
2967             }
2968         }
2969     }
2970     if (zSig2) {
2971         status->float_exception_flags |= float_flag_inexact;
2972     }
2973     if ( increment ) {
2974         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2975         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2976     }
2977     else {
2978         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2979     }
2980     return packFloat128( zSign, zExp, zSig0, zSig1 );
2981
2982 }
2983
2984 /*----------------------------------------------------------------------------
2985 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2986 | and significand formed by the concatenation of `zSig0' and `zSig1', and
2987 | returns the proper quadruple-precision floating-point value corresponding
2988 | to the abstract input.  This routine is just like `roundAndPackFloat128'
2989 | except that the input significand has fewer bits and does not have to be
2990 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
2991 | point exponent.
2992 *----------------------------------------------------------------------------*/
2993
2994 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
2995                                               uint64_t zSig0, uint64_t zSig1,
2996                                               float_status *status)
2997 {
2998     int8_t shiftCount;
2999     uint64_t zSig2;
3000
3001     if ( zSig0 == 0 ) {
3002         zSig0 = zSig1;
3003         zSig1 = 0;
3004         zExp -= 64;
3005     }
3006     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3007     if ( 0 <= shiftCount ) {
3008         zSig2 = 0;
3009         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3010     }
3011     else {
3012         shift128ExtraRightJamming(
3013             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3014     }
3015     zExp -= shiftCount;
3016     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3017
3018 }
3019
3020
3021 /*----------------------------------------------------------------------------
3022 | Returns the result of converting the 32-bit two's complement integer `a'
3023 | to the extended double-precision floating-point format.  The conversion
3024 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3025 | Arithmetic.
3026 *----------------------------------------------------------------------------*/
3027
3028 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3029 {
3030     flag zSign;
3031     uint32_t absA;
3032     int8_t shiftCount;
3033     uint64_t zSig;
3034
3035     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3036     zSign = ( a < 0 );
3037     absA = zSign ? - a : a;
3038     shiftCount = countLeadingZeros32( absA ) + 32;
3039     zSig = absA;
3040     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3041
3042 }
3043
3044 /*----------------------------------------------------------------------------
3045 | Returns the result of converting the 32-bit two's complement integer `a' to
3046 | the quadruple-precision floating-point format.  The conversion is performed
3047 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3048 *----------------------------------------------------------------------------*/
3049
3050 float128 int32_to_float128(int32_t a, float_status *status)
3051 {
3052     flag zSign;
3053     uint32_t absA;
3054     int8_t shiftCount;
3055     uint64_t zSig0;
3056
3057     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3058     zSign = ( a < 0 );
3059     absA = zSign ? - a : a;
3060     shiftCount = countLeadingZeros32( absA ) + 17;
3061     zSig0 = absA;
3062     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3063
3064 }
3065
3066 /*----------------------------------------------------------------------------
3067 | Returns the result of converting the 64-bit two's complement integer `a'
3068 | to the extended double-precision floating-point format.  The conversion
3069 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3070 | Arithmetic.
3071 *----------------------------------------------------------------------------*/
3072
3073 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3074 {
3075     flag zSign;
3076     uint64_t absA;
3077     int8_t shiftCount;
3078
3079     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3080     zSign = ( a < 0 );
3081     absA = zSign ? - a : a;
3082     shiftCount = countLeadingZeros64( absA );
3083     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3084
3085 }
3086
3087 /*----------------------------------------------------------------------------
3088 | Returns the result of converting the 64-bit two's complement integer `a' to
3089 | the quadruple-precision floating-point format.  The conversion is performed
3090 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3091 *----------------------------------------------------------------------------*/
3092
3093 float128 int64_to_float128(int64_t a, float_status *status)
3094 {
3095     flag zSign;
3096     uint64_t absA;
3097     int8_t shiftCount;
3098     int32_t zExp;
3099     uint64_t zSig0, zSig1;
3100
3101     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3102     zSign = ( a < 0 );
3103     absA = zSign ? - a : a;
3104     shiftCount = countLeadingZeros64( absA ) + 49;
3105     zExp = 0x406E - shiftCount;
3106     if ( 64 <= shiftCount ) {
3107         zSig1 = 0;
3108         zSig0 = absA;
3109         shiftCount -= 64;
3110     }
3111     else {
3112         zSig1 = absA;
3113         zSig0 = 0;
3114     }
3115     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3116     return packFloat128( zSign, zExp, zSig0, zSig1 );
3117
3118 }
3119
3120 /*----------------------------------------------------------------------------
3121 | Returns the result of converting the 64-bit unsigned integer `a'
3122 | to the quadruple-precision floating-point format.  The conversion is performed
3123 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3124 *----------------------------------------------------------------------------*/
3125
3126 float128 uint64_to_float128(uint64_t a, float_status *status)
3127 {
3128     if (a == 0) {
3129         return float128_zero;
3130     }
3131     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3132 }
3133
3134
3135
3136
3137 /*----------------------------------------------------------------------------
3138 | Returns the result of converting the single-precision floating-point value
3139 | `a' to the double-precision floating-point format.  The conversion is
3140 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3141 | Arithmetic.
3142 *----------------------------------------------------------------------------*/
3143
3144 float64 float32_to_float64(float32 a, float_status *status)
3145 {
3146     flag aSign;
3147     int aExp;
3148     uint32_t aSig;
3149     a = float32_squash_input_denormal(a, status);
3150
3151     aSig = extractFloat32Frac( a );
3152     aExp = extractFloat32Exp( a );
3153     aSign = extractFloat32Sign( a );
3154     if ( aExp == 0xFF ) {
3155         if (aSig) {
3156             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3157         }
3158         return packFloat64( aSign, 0x7FF, 0 );
3159     }
3160     if ( aExp == 0 ) {
3161         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3162         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3163         --aExp;
3164     }
3165     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3166
3167 }
3168
3169 /*----------------------------------------------------------------------------
3170 | Returns the result of converting the single-precision floating-point value
3171 | `a' to the extended double-precision floating-point format.  The conversion
3172 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3173 | Arithmetic.
3174 *----------------------------------------------------------------------------*/
3175
3176 floatx80 float32_to_floatx80(float32 a, float_status *status)
3177 {
3178     flag aSign;
3179     int aExp;
3180     uint32_t aSig;
3181
3182     a = float32_squash_input_denormal(a, status);
3183     aSig = extractFloat32Frac( a );
3184     aExp = extractFloat32Exp( a );
3185     aSign = extractFloat32Sign( a );
3186     if ( aExp == 0xFF ) {
3187         if (aSig) {
3188             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3189         }
3190         return packFloatx80(aSign,
3191                             floatx80_infinity_high,
3192                             floatx80_infinity_low);
3193     }
3194     if ( aExp == 0 ) {
3195         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3196         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3197     }
3198     aSig |= 0x00800000;
3199     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3200
3201 }
3202
3203 /*----------------------------------------------------------------------------
3204 | Returns the result of converting the single-precision floating-point value
3205 | `a' to the double-precision floating-point format.  The conversion is
3206 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3207 | Arithmetic.
3208 *----------------------------------------------------------------------------*/
3209
3210 float128 float32_to_float128(float32 a, float_status *status)
3211 {
3212     flag aSign;
3213     int aExp;
3214     uint32_t aSig;
3215
3216     a = float32_squash_input_denormal(a, status);
3217     aSig = extractFloat32Frac( a );
3218     aExp = extractFloat32Exp( a );
3219     aSign = extractFloat32Sign( a );
3220     if ( aExp == 0xFF ) {
3221         if (aSig) {
3222             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3223         }
3224         return packFloat128( aSign, 0x7FFF, 0, 0 );
3225     }
3226     if ( aExp == 0 ) {
3227         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3228         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3229         --aExp;
3230     }
3231     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3232
3233 }
3234
3235 /*----------------------------------------------------------------------------
3236 | Returns the remainder of the single-precision floating-point value `a'
3237 | with respect to the corresponding value `b'.  The operation is performed
3238 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3239 *----------------------------------------------------------------------------*/
3240
3241 float32 float32_rem(float32 a, float32 b, float_status *status)
3242 {
3243     flag aSign, zSign;
3244     int aExp, bExp, expDiff;
3245     uint32_t aSig, bSig;
3246     uint32_t q;
3247     uint64_t aSig64, bSig64, q64;
3248     uint32_t alternateASig;
3249     int32_t sigMean;
3250     a = float32_squash_input_denormal(a, status);
3251     b = float32_squash_input_denormal(b, status);
3252
3253     aSig = extractFloat32Frac( a );
3254     aExp = extractFloat32Exp( a );
3255     aSign = extractFloat32Sign( a );
3256     bSig = extractFloat32Frac( b );
3257     bExp = extractFloat32Exp( b );
3258     if ( aExp == 0xFF ) {
3259         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3260             return propagateFloat32NaN(a, b, status);
3261         }
3262         float_raise(float_flag_invalid, status);
3263         return float32_default_nan(status);
3264     }
3265     if ( bExp == 0xFF ) {
3266         if (bSig) {
3267             return propagateFloat32NaN(a, b, status);
3268         }
3269         return a;
3270     }
3271     if ( bExp == 0 ) {
3272         if ( bSig == 0 ) {
3273             float_raise(float_flag_invalid, status);
3274             return float32_default_nan(status);
3275         }
3276         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3277     }
3278     if ( aExp == 0 ) {
3279         if ( aSig == 0 ) return a;
3280         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3281     }
3282     expDiff = aExp - bExp;
3283     aSig |= 0x00800000;
3284     bSig |= 0x00800000;
3285     if ( expDiff < 32 ) {
3286         aSig <<= 8;
3287         bSig <<= 8;
3288         if ( expDiff < 0 ) {
3289             if ( expDiff < -1 ) return a;
3290             aSig >>= 1;
3291         }
3292         q = ( bSig <= aSig );
3293         if ( q ) aSig -= bSig;
3294         if ( 0 < expDiff ) {
3295             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3296             q >>= 32 - expDiff;
3297             bSig >>= 2;
3298             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3299         }
3300         else {
3301             aSig >>= 2;
3302             bSig >>= 2;
3303         }
3304     }
3305     else {
3306         if ( bSig <= aSig ) aSig -= bSig;
3307         aSig64 = ( (uint64_t) aSig )<<40;
3308         bSig64 = ( (uint64_t) bSig )<<40;
3309         expDiff -= 64;
3310         while ( 0 < expDiff ) {
3311             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3312             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3313             aSig64 = - ( ( bSig * q64 )<<38 );
3314             expDiff -= 62;
3315         }
3316         expDiff += 64;
3317         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3318         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3319         q = q64>>( 64 - expDiff );
3320         bSig <<= 6;
3321         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3322     }
3323     do {
3324         alternateASig = aSig;
3325         ++q;
3326         aSig -= bSig;
3327     } while ( 0 <= (int32_t) aSig );
3328     sigMean = aSig + alternateASig;
3329     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3330         aSig = alternateASig;
3331     }
3332     zSign = ( (int32_t) aSig < 0 );
3333     if ( zSign ) aSig = - aSig;
3334     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3335 }
3336
3337
3338
3339 /*----------------------------------------------------------------------------
3340 | Returns the binary exponential of the single-precision floating-point value
3341 | `a'. The operation is performed according to the IEC/IEEE Standard for
3342 | Binary Floating-Point Arithmetic.
3343 |
3344 | Uses the following identities:
3345 |
3346 | 1. -------------------------------------------------------------------------
3347 |      x    x*ln(2)
3348 |     2  = e
3349 |
3350 | 2. -------------------------------------------------------------------------
3351 |                      2     3     4     5           n
3352 |      x        x     x     x     x     x           x
3353 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3354 |               1!    2!    3!    4!    5!          n!
3355 *----------------------------------------------------------------------------*/
3356
3357 static const float64 float32_exp2_coefficients[15] =
3358 {
3359     const_float64( 0x3ff0000000000000ll ), /*  1 */
3360     const_float64( 0x3fe0000000000000ll ), /*  2 */
3361     const_float64( 0x3fc5555555555555ll ), /*  3 */
3362     const_float64( 0x3fa5555555555555ll ), /*  4 */
3363     const_float64( 0x3f81111111111111ll ), /*  5 */
3364     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3365     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3366     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3367     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3368     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3369     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3370     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3371     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3372     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3373     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3374 };
3375
3376 float32 float32_exp2(float32 a, float_status *status)
3377 {
3378     flag aSign;
3379     int aExp;
3380     uint32_t aSig;
3381     float64 r, x, xn;
3382     int i;
3383     a = float32_squash_input_denormal(a, status);
3384
3385     aSig = extractFloat32Frac( a );
3386     aExp = extractFloat32Exp( a );
3387     aSign = extractFloat32Sign( a );
3388
3389     if ( aExp == 0xFF) {
3390         if (aSig) {
3391             return propagateFloat32NaN(a, float32_zero, status);
3392         }
3393         return (aSign) ? float32_zero : a;
3394     }
3395     if (aExp == 0) {
3396         if (aSig == 0) return float32_one;
3397     }
3398
3399     float_raise(float_flag_inexact, status);
3400
3401     /* ******************************* */
3402     /* using float64 for approximation */
3403     /* ******************************* */
3404     x = float32_to_float64(a, status);
3405     x = float64_mul(x, float64_ln2, status);
3406
3407     xn = x;
3408     r = float64_one;
3409     for (i = 0 ; i < 15 ; i++) {
3410         float64 f;
3411
3412         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3413         r = float64_add(r, f, status);
3414
3415         xn = float64_mul(xn, x, status);
3416     }
3417
3418     return float64_to_float32(r, status);
3419 }
3420
3421 /*----------------------------------------------------------------------------
3422 | Returns the binary log of the single-precision floating-point value `a'.
3423 | The operation is performed according to the IEC/IEEE Standard for Binary
3424 | Floating-Point Arithmetic.
3425 *----------------------------------------------------------------------------*/
3426 float32 float32_log2(float32 a, float_status *status)
3427 {
3428     flag aSign, zSign;
3429     int aExp;
3430     uint32_t aSig, zSig, i;
3431
3432     a = float32_squash_input_denormal(a, status);
3433     aSig = extractFloat32Frac( a );
3434     aExp = extractFloat32Exp( a );
3435     aSign = extractFloat32Sign( a );
3436
3437     if ( aExp == 0 ) {
3438         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3439         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3440     }
3441     if ( aSign ) {
3442         float_raise(float_flag_invalid, status);
3443         return float32_default_nan(status);
3444     }
3445     if ( aExp == 0xFF ) {
3446         if (aSig) {
3447             return propagateFloat32NaN(a, float32_zero, status);
3448         }
3449         return a;
3450     }
3451
3452     aExp -= 0x7F;
3453     aSig |= 0x00800000;
3454     zSign = aExp < 0;
3455     zSig = aExp << 23;
3456
3457     for (i = 1 << 22; i > 0; i >>= 1) {
3458         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3459         if ( aSig & 0x01000000 ) {
3460             aSig >>= 1;
3461             zSig |= i;
3462         }
3463     }
3464
3465     if ( zSign )
3466         zSig = -zSig;
3467
3468     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3469 }
3470
3471 /*----------------------------------------------------------------------------
3472 | Returns 1 if the single-precision floating-point value `a' is equal to
3473 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3474 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3475 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3476 *----------------------------------------------------------------------------*/
3477
3478 int float32_eq(float32 a, float32 b, float_status *status)
3479 {
3480     uint32_t av, bv;
3481     a = float32_squash_input_denormal(a, status);
3482     b = float32_squash_input_denormal(b, status);
3483
3484     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3485          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3486        ) {
3487         float_raise(float_flag_invalid, status);
3488         return 0;
3489     }
3490     av = float32_val(a);
3491     bv = float32_val(b);
3492     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3493 }
3494
3495 /*----------------------------------------------------------------------------
3496 | Returns 1 if the single-precision floating-point value `a' is less than
3497 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3498 | exception is raised if either operand is a NaN.  The comparison is performed
3499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3500 *----------------------------------------------------------------------------*/
3501
3502 int float32_le(float32 a, float32 b, float_status *status)
3503 {
3504     flag aSign, bSign;
3505     uint32_t av, bv;
3506     a = float32_squash_input_denormal(a, status);
3507     b = float32_squash_input_denormal(b, status);
3508
3509     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3510          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3511        ) {
3512         float_raise(float_flag_invalid, status);
3513         return 0;
3514     }
3515     aSign = extractFloat32Sign( a );
3516     bSign = extractFloat32Sign( b );
3517     av = float32_val(a);
3518     bv = float32_val(b);
3519     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3520     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3521
3522 }
3523
3524 /*----------------------------------------------------------------------------
3525 | Returns 1 if the single-precision floating-point value `a' is less than
3526 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3527 | raised if either operand is a NaN.  The comparison is performed according
3528 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3529 *----------------------------------------------------------------------------*/
3530
3531 int float32_lt(float32 a, float32 b, float_status *status)
3532 {
3533     flag aSign, bSign;
3534     uint32_t av, bv;
3535     a = float32_squash_input_denormal(a, status);
3536     b = float32_squash_input_denormal(b, status);
3537
3538     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3539          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3540        ) {
3541         float_raise(float_flag_invalid, status);
3542         return 0;
3543     }
3544     aSign = extractFloat32Sign( a );
3545     bSign = extractFloat32Sign( b );
3546     av = float32_val(a);
3547     bv = float32_val(b);
3548     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3549     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3550
3551 }
3552
3553 /*----------------------------------------------------------------------------
3554 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3555 | be compared, and 0 otherwise.  The invalid exception is raised if either
3556 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3557 | Standard for Binary Floating-Point Arithmetic.
3558 *----------------------------------------------------------------------------*/
3559
3560 int float32_unordered(float32 a, float32 b, float_status *status)
3561 {
3562     a = float32_squash_input_denormal(a, status);
3563     b = float32_squash_input_denormal(b, status);
3564
3565     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3566          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3567        ) {
3568         float_raise(float_flag_invalid, status);
3569         return 1;
3570     }
3571     return 0;
3572 }
3573
3574 /*----------------------------------------------------------------------------
3575 | Returns 1 if the single-precision floating-point value `a' is equal to
3576 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3577 | exception.  The comparison is performed according to the IEC/IEEE Standard
3578 | for Binary Floating-Point Arithmetic.
3579 *----------------------------------------------------------------------------*/
3580
3581 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3582 {
3583     a = float32_squash_input_denormal(a, status);
3584     b = float32_squash_input_denormal(b, status);
3585
3586     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3587          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3588        ) {
3589         if (float32_is_signaling_nan(a, status)
3590          || float32_is_signaling_nan(b, status)) {
3591             float_raise(float_flag_invalid, status);
3592         }
3593         return 0;
3594     }
3595     return ( float32_val(a) == float32_val(b) ) ||
3596             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3597 }
3598
3599 /*----------------------------------------------------------------------------
3600 | Returns 1 if the single-precision floating-point value `a' is less than or
3601 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3602 | cause an exception.  Otherwise, the comparison is performed according to the
3603 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3604 *----------------------------------------------------------------------------*/
3605
3606 int float32_le_quiet(float32 a, float32 b, float_status *status)
3607 {
3608     flag aSign, bSign;
3609     uint32_t av, bv;
3610     a = float32_squash_input_denormal(a, status);
3611     b = float32_squash_input_denormal(b, status);
3612
3613     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3614          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3615        ) {
3616         if (float32_is_signaling_nan(a, status)
3617          || float32_is_signaling_nan(b, status)) {
3618             float_raise(float_flag_invalid, status);
3619         }
3620         return 0;
3621     }
3622     aSign = extractFloat32Sign( a );
3623     bSign = extractFloat32Sign( b );
3624     av = float32_val(a);
3625     bv = float32_val(b);
3626     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3627     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3628
3629 }
3630
3631 /*----------------------------------------------------------------------------
3632 | Returns 1 if the single-precision floating-point value `a' is less than
3633 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3634 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3635 | Standard for Binary Floating-Point Arithmetic.
3636 *----------------------------------------------------------------------------*/
3637
3638 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3639 {
3640     flag aSign, bSign;
3641     uint32_t av, bv;
3642     a = float32_squash_input_denormal(a, status);
3643     b = float32_squash_input_denormal(b, status);
3644
3645     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3646          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3647        ) {
3648         if (float32_is_signaling_nan(a, status)
3649          || float32_is_signaling_nan(b, status)) {
3650             float_raise(float_flag_invalid, status);
3651         }
3652         return 0;
3653     }
3654     aSign = extractFloat32Sign( a );
3655     bSign = extractFloat32Sign( b );
3656     av = float32_val(a);
3657     bv = float32_val(b);
3658     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3659     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3660
3661 }
3662
3663 /*----------------------------------------------------------------------------
3664 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3665 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3666 | comparison is performed according to the IEC/IEEE Standard for Binary
3667 | Floating-Point Arithmetic.
3668 *----------------------------------------------------------------------------*/
3669
3670 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3671 {
3672     a = float32_squash_input_denormal(a, status);
3673     b = float32_squash_input_denormal(b, status);
3674
3675     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3676          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3677        ) {
3678         if (float32_is_signaling_nan(a, status)
3679          || float32_is_signaling_nan(b, status)) {
3680             float_raise(float_flag_invalid, status);
3681         }
3682         return 1;
3683     }
3684     return 0;
3685 }
3686
3687
3688 /*----------------------------------------------------------------------------
3689 | Returns the result of converting the double-precision floating-point value
3690 | `a' to the single-precision floating-point format.  The conversion is
3691 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3692 | Arithmetic.
3693 *----------------------------------------------------------------------------*/
3694
3695 float32 float64_to_float32(float64 a, float_status *status)
3696 {
3697     flag aSign;
3698     int aExp;
3699     uint64_t aSig;
3700     uint32_t zSig;
3701     a = float64_squash_input_denormal(a, status);
3702
3703     aSig = extractFloat64Frac( a );
3704     aExp = extractFloat64Exp( a );
3705     aSign = extractFloat64Sign( a );
3706     if ( aExp == 0x7FF ) {
3707         if (aSig) {
3708             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3709         }
3710         return packFloat32( aSign, 0xFF, 0 );
3711     }
3712     shift64RightJamming( aSig, 22, &aSig );
3713     zSig = aSig;
3714     if ( aExp || zSig ) {
3715         zSig |= 0x40000000;
3716         aExp -= 0x381;
3717     }
3718     return roundAndPackFloat32(aSign, aExp, zSig, status);
3719
3720 }
3721
3722
3723 /*----------------------------------------------------------------------------
3724 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3725 | half-precision floating-point value, returning the result.  After being
3726 | shifted into the proper positions, the three fields are simply added
3727 | together to form the result.  This means that any integer portion of `zSig'
3728 | will be added into the exponent.  Since a properly normalized significand
3729 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3730 | than the desired result exponent whenever `zSig' is a complete, normalized
3731 | significand.
3732 *----------------------------------------------------------------------------*/
3733 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3734 {
3735     return make_float16(
3736         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3737 }
3738
3739 /*----------------------------------------------------------------------------
3740 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3741 | and significand `zSig', and returns the proper half-precision floating-
3742 | point value corresponding to the abstract input.  Ordinarily, the abstract
3743 | value is simply rounded and packed into the half-precision format, with
3744 | the inexact exception raised if the abstract input cannot be represented
3745 | exactly.  However, if the abstract value is too large, the overflow and
3746 | inexact exceptions are raised and an infinity or maximal finite value is
3747 | returned.  If the abstract value is too small, the input value is rounded to
3748 | a subnormal number, and the underflow and inexact exceptions are raised if
3749 | the abstract input cannot be represented exactly as a subnormal half-
3750 | precision floating-point number.
3751 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3752 | ARM-style "alternative representation", which omits the NaN and Inf
3753 | encodings in order to raise the maximum representable exponent by one.
3754 |     The input significand `zSig' has its binary point between bits 22
3755 | and 23, which is 13 bits to the left of the usual location.  This shifted
3756 | significand must be normalized or smaller.  If `zSig' is not normalized,
3757 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3758 | and it must not require rounding.  In the usual case that `zSig' is
3759 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3760 | Note the slightly odd position of the binary point in zSig compared with the
3761 | other roundAndPackFloat functions. This should probably be fixed if we
3762 | need to implement more float16 routines than just conversion.
3763 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3764 | Binary Floating-Point Arithmetic.
3765 *----------------------------------------------------------------------------*/
3766
3767 static float16 roundAndPackFloat16(flag zSign, int zExp,
3768                                    uint32_t zSig, flag ieee,
3769                                    float_status *status)
3770 {
3771     int maxexp = ieee ? 29 : 30;
3772     uint32_t mask;
3773     uint32_t increment;
3774     bool rounding_bumps_exp;
3775     bool is_tiny = false;
3776
3777     /* Calculate the mask of bits of the mantissa which are not
3778      * representable in half-precision and will be lost.
3779      */
3780     if (zExp < 1) {
3781         /* Will be denormal in halfprec */
3782         mask = 0x00ffffff;
3783         if (zExp >= -11) {
3784             mask >>= 11 + zExp;
3785         }
3786     } else {
3787         /* Normal number in halfprec */
3788         mask = 0x00001fff;
3789     }
3790
3791     switch (status->float_rounding_mode) {
3792     case float_round_nearest_even:
3793         increment = (mask + 1) >> 1;
3794         if ((zSig & mask) == increment) {
3795             increment = zSig & (increment << 1);
3796         }
3797         break;
3798     case float_round_ties_away:
3799         increment = (mask + 1) >> 1;
3800         break;
3801     case float_round_up:
3802         increment = zSign ? 0 : mask;
3803         break;
3804     case float_round_down:
3805         increment = zSign ? mask : 0;
3806         break;
3807     default: /* round_to_zero */
3808         increment = 0;
3809         break;
3810     }
3811
3812     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3813
3814     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3815         if (ieee) {
3816             float_raise(float_flag_overflow | float_flag_inexact, status);
3817             return packFloat16(zSign, 0x1f, 0);
3818         } else {
3819             float_raise(float_flag_invalid, status);
3820             return packFloat16(zSign, 0x1f, 0x3ff);
3821         }
3822     }
3823
3824     if (zExp < 0) {
3825         /* Note that flush-to-zero does not affect half-precision results */
3826         is_tiny =
3827             (status->float_detect_tininess == float_tininess_before_rounding)
3828             || (zExp < -1)
3829             || (!rounding_bumps_exp);
3830     }
3831     if (zSig & mask) {
3832         float_raise(float_flag_inexact, status);
3833         if (is_tiny) {
3834             float_raise(float_flag_underflow, status);
3835         }
3836     }
3837
3838     zSig += increment;
3839     if (rounding_bumps_exp) {
3840         zSig >>= 1;
3841         zExp++;
3842     }
3843
3844     if (zExp < -10) {
3845         return packFloat16(zSign, 0, 0);
3846     }
3847     if (zExp < 0) {
3848         zSig >>= -zExp;
3849         zExp = 0;
3850     }
3851     return packFloat16(zSign, zExp, zSig >> 13);
3852 }
3853
3854 /*----------------------------------------------------------------------------
3855 | If `a' is denormal and we are in flush-to-zero mode then set the
3856 | input-denormal exception and return zero. Otherwise just return the value.
3857 *----------------------------------------------------------------------------*/
3858 float16 float16_squash_input_denormal(float16 a, float_status *status)
3859 {
3860     if (status->flush_inputs_to_zero) {
3861         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3862             float_raise(float_flag_input_denormal, status);
3863             return make_float16(float16_val(a) & 0x8000);
3864         }
3865     }
3866     return a;
3867 }
3868
3869 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3870                                       uint32_t *zSigPtr)
3871 {
3872     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3873     *zSigPtr = aSig << shiftCount;
3874     *zExpPtr = 1 - shiftCount;
3875 }
3876
3877 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3878    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3879
3880 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3881 {
3882     flag aSign;
3883     int aExp;
3884     uint32_t aSig;
3885
3886     aSign = extractFloat16Sign(a);
3887     aExp = extractFloat16Exp(a);
3888     aSig = extractFloat16Frac(a);
3889
3890     if (aExp == 0x1f && ieee) {
3891         if (aSig) {
3892             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3893         }
3894         return packFloat32(aSign, 0xff, 0);
3895     }
3896     if (aExp == 0) {
3897         if (aSig == 0) {
3898             return packFloat32(aSign, 0, 0);
3899         }
3900
3901         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3902         aExp--;
3903     }
3904     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3905 }
3906
3907 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3908 {
3909     flag aSign;
3910     int aExp;
3911     uint32_t aSig;
3912
3913     a = float32_squash_input_denormal(a, status);
3914
3915     aSig = extractFloat32Frac( a );
3916     aExp = extractFloat32Exp( a );
3917     aSign = extractFloat32Sign( a );
3918     if ( aExp == 0xFF ) {
3919         if (aSig) {
3920             /* Input is a NaN */
3921             if (!ieee) {
3922                 float_raise(float_flag_invalid, status);
3923                 return packFloat16(aSign, 0, 0);
3924             }
3925             return commonNaNToFloat16(
3926                 float32ToCommonNaN(a, status), status);
3927         }
3928         /* Infinity */
3929         if (!ieee) {
3930             float_raise(float_flag_invalid, status);
3931             return packFloat16(aSign, 0x1f, 0x3ff);
3932         }
3933         return packFloat16(aSign, 0x1f, 0);
3934     }
3935     if (aExp == 0 && aSig == 0) {
3936         return packFloat16(aSign, 0, 0);
3937     }
3938     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3939      * even if the input is denormal; however this is harmless because
3940      * the largest possible single-precision denormal is still smaller
3941      * than the smallest representable half-precision denormal, and so we
3942      * will end up ignoring aSig and returning via the "always return zero"
3943      * codepath.
3944      */
3945     aSig |= 0x00800000;
3946     aExp -= 0x71;
3947
3948     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3949 }
3950
3951 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3952 {
3953     flag aSign;
3954     int aExp;
3955     uint32_t aSig;
3956
3957     aSign = extractFloat16Sign(a);
3958     aExp = extractFloat16Exp(a);
3959     aSig = extractFloat16Frac(a);
3960
3961     if (aExp == 0x1f && ieee) {
3962         if (aSig) {
3963             return commonNaNToFloat64(
3964                 float16ToCommonNaN(a, status), status);
3965         }
3966         return packFloat64(aSign, 0x7ff, 0);
3967     }
3968     if (aExp == 0) {
3969         if (aSig == 0) {
3970             return packFloat64(aSign, 0, 0);
3971         }
3972
3973         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3974         aExp--;
3975     }
3976     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3977 }
3978
3979 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3980 {
3981     flag aSign;
3982     int aExp;
3983     uint64_t aSig;
3984     uint32_t zSig;
3985
3986     a = float64_squash_input_denormal(a, status);
3987
3988     aSig = extractFloat64Frac(a);
3989     aExp = extractFloat64Exp(a);
3990     aSign = extractFloat64Sign(a);
3991     if (aExp == 0x7FF) {
3992         if (aSig) {
3993             /* Input is a NaN */
3994             if (!ieee) {
3995                 float_raise(float_flag_invalid, status);
3996                 return packFloat16(aSign, 0, 0);
3997             }
3998             return commonNaNToFloat16(
3999                 float64ToCommonNaN(a, status), status);
4000         }
4001         /* Infinity */
4002         if (!ieee) {
4003             float_raise(float_flag_invalid, status);
4004             return packFloat16(aSign, 0x1f, 0x3ff);
4005         }
4006         return packFloat16(aSign, 0x1f, 0);
4007     }
4008     shift64RightJamming(aSig, 29, &aSig);
4009     zSig = aSig;
4010     if (aExp == 0 && zSig == 0) {
4011         return packFloat16(aSign, 0, 0);
4012     }
4013     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4014      * even if the input is denormal; however this is harmless because
4015      * the largest possible single-precision denormal is still smaller
4016      * than the smallest representable half-precision denormal, and so we
4017      * will end up ignoring aSig and returning via the "always return zero"
4018      * codepath.
4019      */
4020     zSig |= 0x00800000;
4021     aExp -= 0x3F1;
4022
4023     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4024 }
4025
4026 /*----------------------------------------------------------------------------
4027 | Returns the result of converting the double-precision floating-point value
4028 | `a' to the extended double-precision floating-point format.  The conversion
4029 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4030 | Arithmetic.
4031 *----------------------------------------------------------------------------*/
4032
4033 floatx80 float64_to_floatx80(float64 a, float_status *status)
4034 {
4035     flag aSign;
4036     int aExp;
4037     uint64_t aSig;
4038
4039     a = float64_squash_input_denormal(a, status);
4040     aSig = extractFloat64Frac( a );
4041     aExp = extractFloat64Exp( a );
4042     aSign = extractFloat64Sign( a );
4043     if ( aExp == 0x7FF ) {
4044         if (aSig) {
4045             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4046         }
4047         return packFloatx80(aSign,
4048                             floatx80_infinity_high,
4049                             floatx80_infinity_low);
4050     }
4051     if ( aExp == 0 ) {
4052         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4053         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4054     }
4055     return
4056         packFloatx80(
4057             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4058
4059 }
4060
4061 /*----------------------------------------------------------------------------
4062 | Returns the result of converting the double-precision floating-point value
4063 | `a' to the quadruple-precision floating-point format.  The conversion is
4064 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4065 | Arithmetic.
4066 *----------------------------------------------------------------------------*/
4067
4068 float128 float64_to_float128(float64 a, float_status *status)
4069 {
4070     flag aSign;
4071     int aExp;
4072     uint64_t aSig, zSig0, zSig1;
4073
4074     a = float64_squash_input_denormal(a, status);
4075     aSig = extractFloat64Frac( a );
4076     aExp = extractFloat64Exp( a );
4077     aSign = extractFloat64Sign( a );
4078     if ( aExp == 0x7FF ) {
4079         if (aSig) {
4080             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4081         }
4082         return packFloat128( aSign, 0x7FFF, 0, 0 );
4083     }
4084     if ( aExp == 0 ) {
4085         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4086         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4087         --aExp;
4088     }
4089     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4090     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4091
4092 }
4093
4094
4095 /*----------------------------------------------------------------------------
4096 | Returns the remainder of the double-precision floating-point value `a'
4097 | with respect to the corresponding value `b'.  The operation is performed
4098 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4099 *----------------------------------------------------------------------------*/
4100
4101 float64 float64_rem(float64 a, float64 b, float_status *status)
4102 {
4103     flag aSign, zSign;
4104     int aExp, bExp, expDiff;
4105     uint64_t aSig, bSig;
4106     uint64_t q, alternateASig;
4107     int64_t sigMean;
4108
4109     a = float64_squash_input_denormal(a, status);
4110     b = float64_squash_input_denormal(b, status);
4111     aSig = extractFloat64Frac( a );
4112     aExp = extractFloat64Exp( a );
4113     aSign = extractFloat64Sign( a );
4114     bSig = extractFloat64Frac( b );
4115     bExp = extractFloat64Exp( b );
4116     if ( aExp == 0x7FF ) {
4117         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4118             return propagateFloat64NaN(a, b, status);
4119         }
4120         float_raise(float_flag_invalid, status);
4121         return float64_default_nan(status);
4122     }
4123     if ( bExp == 0x7FF ) {
4124         if (bSig) {
4125             return propagateFloat64NaN(a, b, status);
4126         }
4127         return a;
4128     }
4129     if ( bExp == 0 ) {
4130         if ( bSig == 0 ) {
4131             float_raise(float_flag_invalid, status);
4132             return float64_default_nan(status);
4133         }
4134         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4135     }
4136     if ( aExp == 0 ) {
4137         if ( aSig == 0 ) return a;
4138         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4139     }
4140     expDiff = aExp - bExp;
4141     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4142     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4143     if ( expDiff < 0 ) {
4144         if ( expDiff < -1 ) return a;
4145         aSig >>= 1;
4146     }
4147     q = ( bSig <= aSig );
4148     if ( q ) aSig -= bSig;
4149     expDiff -= 64;
4150     while ( 0 < expDiff ) {
4151         q = estimateDiv128To64( aSig, 0, bSig );
4152         q = ( 2 < q ) ? q - 2 : 0;
4153         aSig = - ( ( bSig>>2 ) * q );
4154         expDiff -= 62;
4155     }
4156     expDiff += 64;
4157     if ( 0 < expDiff ) {
4158         q = estimateDiv128To64( aSig, 0, bSig );
4159         q = ( 2 < q ) ? q - 2 : 0;
4160         q >>= 64 - expDiff;
4161         bSig >>= 2;
4162         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4163     }
4164     else {
4165         aSig >>= 2;
4166         bSig >>= 2;
4167     }
4168     do {
4169         alternateASig = aSig;
4170         ++q;
4171         aSig -= bSig;
4172     } while ( 0 <= (int64_t) aSig );
4173     sigMean = aSig + alternateASig;
4174     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4175         aSig = alternateASig;
4176     }
4177     zSign = ( (int64_t) aSig < 0 );
4178     if ( zSign ) aSig = - aSig;
4179     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4180
4181 }
4182
4183 /*----------------------------------------------------------------------------
4184 | Returns the binary log of the double-precision floating-point value `a'.
4185 | The operation is performed according to the IEC/IEEE Standard for Binary
4186 | Floating-Point Arithmetic.
4187 *----------------------------------------------------------------------------*/
4188 float64 float64_log2(float64 a, float_status *status)
4189 {
4190     flag aSign, zSign;
4191     int aExp;
4192     uint64_t aSig, aSig0, aSig1, zSig, i;
4193     a = float64_squash_input_denormal(a, status);
4194
4195     aSig = extractFloat64Frac( a );
4196     aExp = extractFloat64Exp( a );
4197     aSign = extractFloat64Sign( a );
4198
4199     if ( aExp == 0 ) {
4200         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4201         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4202     }
4203     if ( aSign ) {
4204         float_raise(float_flag_invalid, status);
4205         return float64_default_nan(status);
4206     }
4207     if ( aExp == 0x7FF ) {
4208         if (aSig) {
4209             return propagateFloat64NaN(a, float64_zero, status);
4210         }
4211         return a;
4212     }
4213
4214     aExp -= 0x3FF;
4215     aSig |= LIT64( 0x0010000000000000 );
4216     zSign = aExp < 0;
4217     zSig = (uint64_t)aExp << 52;
4218     for (i = 1LL << 51; i > 0; i >>= 1) {
4219         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4220         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4221         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4222             aSig >>= 1;
4223             zSig |= i;
4224         }
4225     }
4226
4227     if ( zSign )
4228         zSig = -zSig;
4229     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4230 }
4231
4232 /*----------------------------------------------------------------------------
4233 | Returns 1 if the double-precision floating-point value `a' is equal to the
4234 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4235 | if either operand is a NaN.  Otherwise, the comparison is performed
4236 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4237 *----------------------------------------------------------------------------*/
4238
4239 int float64_eq(float64 a, float64 b, float_status *status)
4240 {
4241     uint64_t av, bv;
4242     a = float64_squash_input_denormal(a, status);
4243     b = float64_squash_input_denormal(b, status);
4244
4245     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4246          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4247        ) {
4248         float_raise(float_flag_invalid, status);
4249         return 0;
4250     }
4251     av = float64_val(a);
4252     bv = float64_val(b);
4253     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4254
4255 }
4256
4257 /*----------------------------------------------------------------------------
4258 | Returns 1 if the double-precision floating-point value `a' is less than or
4259 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4260 | exception is raised if either operand is a NaN.  The comparison is performed
4261 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4262 *----------------------------------------------------------------------------*/
4263
4264 int float64_le(float64 a, float64 b, float_status *status)
4265 {
4266     flag aSign, bSign;
4267     uint64_t av, bv;
4268     a = float64_squash_input_denormal(a, status);
4269     b = float64_squash_input_denormal(b, status);
4270
4271     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4272          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4273        ) {
4274         float_raise(float_flag_invalid, status);
4275         return 0;
4276     }
4277     aSign = extractFloat64Sign( a );
4278     bSign = extractFloat64Sign( b );
4279     av = float64_val(a);
4280     bv = float64_val(b);
4281     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4282     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4283
4284 }
4285
4286 /*----------------------------------------------------------------------------
4287 | Returns 1 if the double-precision floating-point value `a' is less than
4288 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4289 | raised if either operand is a NaN.  The comparison is performed according
4290 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4291 *----------------------------------------------------------------------------*/
4292
4293 int float64_lt(float64 a, float64 b, float_status *status)
4294 {
4295     flag aSign, bSign;
4296     uint64_t av, bv;
4297
4298     a = float64_squash_input_denormal(a, status);
4299     b = float64_squash_input_denormal(b, status);
4300     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4301          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4302        ) {
4303         float_raise(float_flag_invalid, status);
4304         return 0;
4305     }
4306     aSign = extractFloat64Sign( a );
4307     bSign = extractFloat64Sign( b );
4308     av = float64_val(a);
4309     bv = float64_val(b);
4310     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4311     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4312
4313 }
4314
4315 /*----------------------------------------------------------------------------
4316 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4317 | be compared, and 0 otherwise.  The invalid exception is raised if either
4318 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4319 | Standard for Binary Floating-Point Arithmetic.
4320 *----------------------------------------------------------------------------*/
4321
4322 int float64_unordered(float64 a, float64 b, float_status *status)
4323 {
4324     a = float64_squash_input_denormal(a, status);
4325     b = float64_squash_input_denormal(b, status);
4326
4327     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4328          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4329        ) {
4330         float_raise(float_flag_invalid, status);
4331         return 1;
4332     }
4333     return 0;
4334 }
4335
4336 /*----------------------------------------------------------------------------
4337 | Returns 1 if the double-precision floating-point value `a' is equal to the
4338 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4339 | exception.The comparison is performed according to the IEC/IEEE Standard
4340 | for Binary Floating-Point Arithmetic.
4341 *----------------------------------------------------------------------------*/
4342
4343 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4344 {
4345     uint64_t av, bv;
4346     a = float64_squash_input_denormal(a, status);
4347     b = float64_squash_input_denormal(b, status);
4348
4349     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4350          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4351        ) {
4352         if (float64_is_signaling_nan(a, status)
4353          || float64_is_signaling_nan(b, status)) {
4354             float_raise(float_flag_invalid, status);
4355         }
4356         return 0;
4357     }
4358     av = float64_val(a);
4359     bv = float64_val(b);
4360     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4361
4362 }
4363
4364 /*----------------------------------------------------------------------------
4365 | Returns 1 if the double-precision floating-point value `a' is less than or
4366 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4367 | cause an exception.  Otherwise, the comparison is performed according to the
4368 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4369 *----------------------------------------------------------------------------*/
4370
4371 int float64_le_quiet(float64 a, float64 b, float_status *status)
4372 {
4373     flag aSign, bSign;
4374     uint64_t av, bv;
4375     a = float64_squash_input_denormal(a, status);
4376     b = float64_squash_input_denormal(b, status);
4377
4378     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4379          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4380        ) {
4381         if (float64_is_signaling_nan(a, status)
4382          || float64_is_signaling_nan(b, status)) {
4383             float_raise(float_flag_invalid, status);
4384         }
4385         return 0;
4386     }
4387     aSign = extractFloat64Sign( a );
4388     bSign = extractFloat64Sign( b );
4389     av = float64_val(a);
4390     bv = float64_val(b);
4391     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4392     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4393
4394 }
4395
4396 /*----------------------------------------------------------------------------
4397 | Returns 1 if the double-precision floating-point value `a' is less than
4398 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4399 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4400 | Standard for Binary Floating-Point Arithmetic.
4401 *----------------------------------------------------------------------------*/
4402
4403 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4404 {
4405     flag aSign, bSign;
4406     uint64_t av, bv;
4407     a = float64_squash_input_denormal(a, status);
4408     b = float64_squash_input_denormal(b, status);
4409
4410     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4411          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4412        ) {
4413         if (float64_is_signaling_nan(a, status)
4414          || float64_is_signaling_nan(b, status)) {
4415             float_raise(float_flag_invalid, status);
4416         }
4417         return 0;
4418     }
4419     aSign = extractFloat64Sign( a );
4420     bSign = extractFloat64Sign( b );
4421     av = float64_val(a);
4422     bv = float64_val(b);
4423     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4424     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4425
4426 }
4427
4428 /*----------------------------------------------------------------------------
4429 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4430 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4431 | comparison is performed according to the IEC/IEEE Standard for Binary
4432 | Floating-Point Arithmetic.
4433 *----------------------------------------------------------------------------*/
4434
4435 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4436 {
4437     a = float64_squash_input_denormal(a, status);
4438     b = float64_squash_input_denormal(b, status);
4439
4440     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4441          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4442        ) {
4443         if (float64_is_signaling_nan(a, status)
4444          || float64_is_signaling_nan(b, status)) {
4445             float_raise(float_flag_invalid, status);
4446         }
4447         return 1;
4448     }
4449     return 0;
4450 }
4451
4452 /*----------------------------------------------------------------------------
4453 | Returns the result of converting the extended double-precision floating-
4454 | point value `a' to the 32-bit two's complement integer format.  The
4455 | conversion is performed according to the IEC/IEEE Standard for Binary
4456 | Floating-Point Arithmetic---which means in particular that the conversion
4457 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4458 | largest positive integer is returned.  Otherwise, if the conversion
4459 | overflows, the largest integer with the same sign as `a' is returned.
4460 *----------------------------------------------------------------------------*/
4461
4462 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4463 {
4464     flag aSign;
4465     int32_t aExp, shiftCount;
4466     uint64_t aSig;
4467
4468     if (floatx80_invalid_encoding(a)) {
4469         float_raise(float_flag_invalid, status);
4470         return 1 << 31;
4471     }
4472     aSig = extractFloatx80Frac( a );
4473     aExp = extractFloatx80Exp( a );
4474     aSign = extractFloatx80Sign( a );
4475     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4476     shiftCount = 0x4037 - aExp;
4477     if ( shiftCount <= 0 ) shiftCount = 1;
4478     shift64RightJamming( aSig, shiftCount, &aSig );
4479     return roundAndPackInt32(aSign, aSig, status);
4480
4481 }
4482
4483 /*----------------------------------------------------------------------------
4484 | Returns the result of converting the extended double-precision floating-
4485 | point value `a' to the 32-bit two's complement integer format.  The
4486 | conversion is performed according to the IEC/IEEE Standard for Binary
4487 | Floating-Point Arithmetic, except that the conversion is always rounded
4488 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4489 | Otherwise, if the conversion overflows, the largest integer with the same
4490 | sign as `a' is returned.
4491 *----------------------------------------------------------------------------*/
4492
4493 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4494 {
4495     flag aSign;
4496     int32_t aExp, shiftCount;
4497     uint64_t aSig, savedASig;
4498     int32_t z;
4499
4500     if (floatx80_invalid_encoding(a)) {
4501         float_raise(float_flag_invalid, status);
4502         return 1 << 31;
4503     }
4504     aSig = extractFloatx80Frac( a );
4505     aExp = extractFloatx80Exp( a );
4506     aSign = extractFloatx80Sign( a );
4507     if ( 0x401E < aExp ) {
4508         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4509         goto invalid;
4510     }
4511     else if ( aExp < 0x3FFF ) {
4512         if (aExp || aSig) {
4513             status->float_exception_flags |= float_flag_inexact;
4514         }
4515         return 0;
4516     }
4517     shiftCount = 0x403E - aExp;
4518     savedASig = aSig;
4519     aSig >>= shiftCount;
4520     z = aSig;
4521     if ( aSign ) z = - z;
4522     if ( ( z < 0 ) ^ aSign ) {
4523  invalid:
4524         float_raise(float_flag_invalid, status);
4525         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4526     }
4527     if ( ( aSig<<shiftCount ) != savedASig ) {
4528         status->float_exception_flags |= float_flag_inexact;
4529     }
4530     return z;
4531
4532 }
4533
4534 /*----------------------------------------------------------------------------
4535 | Returns the result of converting the extended double-precision floating-
4536 | point value `a' to the 64-bit two's complement integer format.  The
4537 | conversion is performed according to the IEC/IEEE Standard for Binary
4538 | Floating-Point Arithmetic---which means in particular that the conversion
4539 | is rounded according to the current rounding mode.  If `a' is a NaN,
4540 | the largest positive integer is returned.  Otherwise, if the conversion
4541 | overflows, the largest integer with the same sign as `a' is returned.
4542 *----------------------------------------------------------------------------*/
4543
4544 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4545 {
4546     flag aSign;
4547     int32_t aExp, shiftCount;
4548     uint64_t aSig, aSigExtra;
4549
4550     if (floatx80_invalid_encoding(a)) {
4551         float_raise(float_flag_invalid, status);
4552         return 1ULL << 63;
4553     }
4554     aSig = extractFloatx80Frac( a );
4555     aExp = extractFloatx80Exp( a );
4556     aSign = extractFloatx80Sign( a );
4557     shiftCount = 0x403E - aExp;
4558     if ( shiftCount <= 0 ) {
4559         if ( shiftCount ) {
4560             float_raise(float_flag_invalid, status);
4561             if (!aSign || floatx80_is_any_nan(a)) {
4562                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4563             }
4564             return (int64_t) LIT64( 0x8000000000000000 );
4565         }
4566         aSigExtra = 0;
4567     }
4568     else {
4569         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4570     }
4571     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4572
4573 }
4574
4575 /*----------------------------------------------------------------------------
4576 | Returns the result of converting the extended double-precision floating-
4577 | point value `a' to the 64-bit two's complement integer format.  The
4578 | conversion is performed according to the IEC/IEEE Standard for Binary
4579 | Floating-Point Arithmetic, except that the conversion is always rounded
4580 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4581 | Otherwise, if the conversion overflows, the largest integer with the same
4582 | sign as `a' is returned.
4583 *----------------------------------------------------------------------------*/
4584
4585 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4586 {
4587     flag aSign;
4588     int32_t aExp, shiftCount;
4589     uint64_t aSig;
4590     int64_t z;
4591
4592     if (floatx80_invalid_encoding(a)) {
4593         float_raise(float_flag_invalid, status);
4594         return 1ULL << 63;
4595     }
4596     aSig = extractFloatx80Frac( a );
4597     aExp = extractFloatx80Exp( a );
4598     aSign = extractFloatx80Sign( a );
4599     shiftCount = aExp - 0x403E;
4600     if ( 0 <= shiftCount ) {
4601         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4602         if ( ( a.high != 0xC03E ) || aSig ) {
4603             float_raise(float_flag_invalid, status);
4604             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4605                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4606             }
4607         }
4608         return (int64_t) LIT64( 0x8000000000000000 );
4609     }
4610     else if ( aExp < 0x3FFF ) {
4611         if (aExp | aSig) {
4612             status->float_exception_flags |= float_flag_inexact;
4613         }
4614         return 0;
4615     }
4616     z = aSig>>( - shiftCount );
4617     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4618         status->float_exception_flags |= float_flag_inexact;
4619     }
4620     if ( aSign ) z = - z;
4621     return z;
4622
4623 }
4624
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of converting the extended double-precision floating-
4627 | point value `a' to the single-precision floating-point format.  The
4628 | conversion is performed according to the IEC/IEEE Standard for Binary
4629 | Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631
4632 float32 floatx80_to_float32(floatx80 a, float_status *status)
4633 {
4634     flag aSign;
4635     int32_t aExp;
4636     uint64_t aSig;
4637
4638     if (floatx80_invalid_encoding(a)) {
4639         float_raise(float_flag_invalid, status);
4640         return float32_default_nan(status);
4641     }
4642     aSig = extractFloatx80Frac( a );
4643     aExp = extractFloatx80Exp( a );
4644     aSign = extractFloatx80Sign( a );
4645     if ( aExp == 0x7FFF ) {
4646         if ( (uint64_t) ( aSig<<1 ) ) {
4647             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4648         }
4649         return packFloat32( aSign, 0xFF, 0 );
4650     }
4651     shift64RightJamming( aSig, 33, &aSig );
4652     if ( aExp || aSig ) aExp -= 0x3F81;
4653     return roundAndPackFloat32(aSign, aExp, aSig, status);
4654
4655 }
4656
4657 /*----------------------------------------------------------------------------
4658 | Returns the result of converting the extended double-precision floating-
4659 | point value `a' to the double-precision floating-point format.  The
4660 | conversion is performed according to the IEC/IEEE Standard for Binary
4661 | Floating-Point Arithmetic.
4662 *----------------------------------------------------------------------------*/
4663
4664 float64 floatx80_to_float64(floatx80 a, float_status *status)
4665 {
4666     flag aSign;
4667     int32_t aExp;
4668     uint64_t aSig, zSig;
4669
4670     if (floatx80_invalid_encoding(a)) {
4671         float_raise(float_flag_invalid, status);
4672         return float64_default_nan(status);
4673     }
4674     aSig = extractFloatx80Frac( a );
4675     aExp = extractFloatx80Exp( a );
4676     aSign = extractFloatx80Sign( a );
4677     if ( aExp == 0x7FFF ) {
4678         if ( (uint64_t) ( aSig<<1 ) ) {
4679             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4680         }
4681         return packFloat64( aSign, 0x7FF, 0 );
4682     }
4683     shift64RightJamming( aSig, 1, &zSig );
4684     if ( aExp || aSig ) aExp -= 0x3C01;
4685     return roundAndPackFloat64(aSign, aExp, zSig, status);
4686
4687 }
4688
4689 /*----------------------------------------------------------------------------
4690 | Returns the result of converting the extended double-precision floating-
4691 | point value `a' to the quadruple-precision floating-point format.  The
4692 | conversion is performed according to the IEC/IEEE Standard for Binary
4693 | Floating-Point Arithmetic.
4694 *----------------------------------------------------------------------------*/
4695
4696 float128 floatx80_to_float128(floatx80 a, float_status *status)
4697 {
4698     flag aSign;
4699     int aExp;
4700     uint64_t aSig, zSig0, zSig1;
4701
4702     if (floatx80_invalid_encoding(a)) {
4703         float_raise(float_flag_invalid, status);
4704         return float128_default_nan(status);
4705     }
4706     aSig = extractFloatx80Frac( a );
4707     aExp = extractFloatx80Exp( a );
4708     aSign = extractFloatx80Sign( a );
4709     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4710         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4711     }
4712     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4713     return packFloat128( aSign, aExp, zSig0, zSig1 );
4714
4715 }
4716
4717 /*----------------------------------------------------------------------------
4718 | Rounds the extended double-precision floating-point value `a'
4719 | to the precision provided by floatx80_rounding_precision and returns the
4720 | result as an extended double-precision floating-point value.
4721 | The operation is performed according to the IEC/IEEE Standard for Binary
4722 | Floating-Point Arithmetic.
4723 *----------------------------------------------------------------------------*/
4724
4725 floatx80 floatx80_round(floatx80 a, float_status *status)
4726 {
4727     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4728                                 extractFloatx80Sign(a),
4729                                 extractFloatx80Exp(a),
4730                                 extractFloatx80Frac(a), 0, status);
4731 }
4732
4733 /*----------------------------------------------------------------------------
4734 | Rounds the extended double-precision floating-point value `a' to an integer,
4735 | and returns the result as an extended quadruple-precision floating-point
4736 | value.  The operation is performed according to the IEC/IEEE Standard for
4737 | Binary Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4739
4740 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4741 {
4742     flag aSign;
4743     int32_t aExp;
4744     uint64_t lastBitMask, roundBitsMask;
4745     floatx80 z;
4746
4747     if (floatx80_invalid_encoding(a)) {
4748         float_raise(float_flag_invalid, status);
4749         return floatx80_default_nan(status);
4750     }
4751     aExp = extractFloatx80Exp( a );
4752     if ( 0x403E <= aExp ) {
4753         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4754             return propagateFloatx80NaN(a, a, status);
4755         }
4756         return a;
4757     }
4758     if ( aExp < 0x3FFF ) {
4759         if (    ( aExp == 0 )
4760              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4761             return a;
4762         }
4763         status->float_exception_flags |= float_flag_inexact;
4764         aSign = extractFloatx80Sign( a );
4765         switch (status->float_rounding_mode) {
4766          case float_round_nearest_even:
4767             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4768                ) {
4769                 return
4770                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4771             }
4772             break;
4773         case float_round_ties_away:
4774             if (aExp == 0x3FFE) {
4775                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4776             }
4777             break;
4778          case float_round_down:
4779             return
4780                   aSign ?
4781                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4782                 : packFloatx80( 0, 0, 0 );
4783          case float_round_up:
4784             return
4785                   aSign ? packFloatx80( 1, 0, 0 )
4786                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4787         }
4788         return packFloatx80( aSign, 0, 0 );
4789     }
4790     lastBitMask = 1;
4791     lastBitMask <<= 0x403E - aExp;
4792     roundBitsMask = lastBitMask - 1;
4793     z = a;
4794     switch (status->float_rounding_mode) {
4795     case float_round_nearest_even:
4796         z.low += lastBitMask>>1;
4797         if ((z.low & roundBitsMask) == 0) {
4798             z.low &= ~lastBitMask;
4799         }
4800         break;
4801     case float_round_ties_away:
4802         z.low += lastBitMask >> 1;
4803         break;
4804     case float_round_to_zero:
4805         break;
4806     case float_round_up:
4807         if (!extractFloatx80Sign(z)) {
4808             z.low += roundBitsMask;
4809         }
4810         break;
4811     case float_round_down:
4812         if (extractFloatx80Sign(z)) {
4813             z.low += roundBitsMask;
4814         }
4815         break;
4816     default:
4817         abort();
4818     }
4819     z.low &= ~ roundBitsMask;
4820     if ( z.low == 0 ) {
4821         ++z.high;
4822         z.low = LIT64( 0x8000000000000000 );
4823     }
4824     if (z.low != a.low) {
4825         status->float_exception_flags |= float_flag_inexact;
4826     }
4827     return z;
4828
4829 }
4830
4831 /*----------------------------------------------------------------------------
4832 | Returns the result of adding the absolute values of the extended double-
4833 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4834 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4835 | The addition is performed according to the IEC/IEEE Standard for Binary
4836 | Floating-Point Arithmetic.
4837 *----------------------------------------------------------------------------*/
4838
4839 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4840                                 float_status *status)
4841 {
4842     int32_t aExp, bExp, zExp;
4843     uint64_t aSig, bSig, zSig0, zSig1;
4844     int32_t expDiff;
4845
4846     aSig = extractFloatx80Frac( a );
4847     aExp = extractFloatx80Exp( a );
4848     bSig = extractFloatx80Frac( b );
4849     bExp = extractFloatx80Exp( b );
4850     expDiff = aExp - bExp;
4851     if ( 0 < expDiff ) {
4852         if ( aExp == 0x7FFF ) {
4853             if ((uint64_t)(aSig << 1)) {
4854                 return propagateFloatx80NaN(a, b, status);
4855             }
4856             return a;
4857         }
4858         if ( bExp == 0 ) --expDiff;
4859         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4860         zExp = aExp;
4861     }
4862     else if ( expDiff < 0 ) {
4863         if ( bExp == 0x7FFF ) {
4864             if ((uint64_t)(bSig << 1)) {
4865                 return propagateFloatx80NaN(a, b, status);
4866             }
4867             return packFloatx80(zSign,
4868                                 floatx80_infinity_high,
4869                                 floatx80_infinity_low);
4870         }
4871         if ( aExp == 0 ) ++expDiff;
4872         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4873         zExp = bExp;
4874     }
4875     else {
4876         if ( aExp == 0x7FFF ) {
4877             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4878                 return propagateFloatx80NaN(a, b, status);
4879             }
4880             return a;
4881         }
4882         zSig1 = 0;
4883         zSig0 = aSig + bSig;
4884         if ( aExp == 0 ) {
4885             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4886             goto roundAndPack;
4887         }
4888         zExp = aExp;
4889         goto shiftRight1;
4890     }
4891     zSig0 = aSig + bSig;
4892     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4893  shiftRight1:
4894     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4895     zSig0 |= LIT64( 0x8000000000000000 );
4896     ++zExp;
4897  roundAndPack:
4898     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4899                                 zSign, zExp, zSig0, zSig1, status);
4900 }
4901
4902 /*----------------------------------------------------------------------------
4903 | Returns the result of subtracting the absolute values of the extended
4904 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4905 | difference is negated before being returned.  `zSign' is ignored if the
4906 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4907 | Standard for Binary Floating-Point Arithmetic.
4908 *----------------------------------------------------------------------------*/
4909
4910 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4911                                 float_status *status)
4912 {
4913     int32_t aExp, bExp, zExp;
4914     uint64_t aSig, bSig, zSig0, zSig1;
4915     int32_t expDiff;
4916
4917     aSig = extractFloatx80Frac( a );
4918     aExp = extractFloatx80Exp( a );
4919     bSig = extractFloatx80Frac( b );
4920     bExp = extractFloatx80Exp( b );
4921     expDiff = aExp - bExp;
4922     if ( 0 < expDiff ) goto aExpBigger;
4923     if ( expDiff < 0 ) goto bExpBigger;
4924     if ( aExp == 0x7FFF ) {
4925         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4926             return propagateFloatx80NaN(a, b, status);
4927         }
4928         float_raise(float_flag_invalid, status);
4929         return floatx80_default_nan(status);
4930     }
4931     if ( aExp == 0 ) {
4932         aExp = 1;
4933         bExp = 1;
4934     }
4935     zSig1 = 0;
4936     if ( bSig < aSig ) goto aBigger;
4937     if ( aSig < bSig ) goto bBigger;
4938     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4939  bExpBigger:
4940     if ( bExp == 0x7FFF ) {
4941         if ((uint64_t)(bSig << 1)) {
4942             return propagateFloatx80NaN(a, b, status);
4943         }
4944         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4945                             floatx80_infinity_low);
4946     }
4947     if ( aExp == 0 ) ++expDiff;
4948     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4949  bBigger:
4950     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4951     zExp = bExp;
4952     zSign ^= 1;
4953     goto normalizeRoundAndPack;
4954  aExpBigger:
4955     if ( aExp == 0x7FFF ) {
4956         if ((uint64_t)(aSig << 1)) {
4957             return propagateFloatx80NaN(a, b, status);
4958         }
4959         return a;
4960     }
4961     if ( bExp == 0 ) --expDiff;
4962     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4963  aBigger:
4964     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4965     zExp = aExp;
4966  normalizeRoundAndPack:
4967     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4968                                          zSign, zExp, zSig0, zSig1, status);
4969 }
4970
4971 /*----------------------------------------------------------------------------
4972 | Returns the result of adding the extended double-precision floating-point
4973 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4974 | Standard for Binary Floating-Point Arithmetic.
4975 *----------------------------------------------------------------------------*/
4976
4977 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4978 {
4979     flag aSign, bSign;
4980
4981     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4982         float_raise(float_flag_invalid, status);
4983         return floatx80_default_nan(status);
4984     }
4985     aSign = extractFloatx80Sign( a );
4986     bSign = extractFloatx80Sign( b );
4987     if ( aSign == bSign ) {
4988         return addFloatx80Sigs(a, b, aSign, status);
4989     }
4990     else {
4991         return subFloatx80Sigs(a, b, aSign, status);
4992     }
4993
4994 }
4995
4996 /*----------------------------------------------------------------------------
4997 | Returns the result of subtracting the extended double-precision floating-
4998 | point values `a' and `b'.  The operation is performed according to the
4999 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5000 *----------------------------------------------------------------------------*/
5001
5002 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5003 {
5004     flag aSign, bSign;
5005
5006     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5007         float_raise(float_flag_invalid, status);
5008         return floatx80_default_nan(status);
5009     }
5010     aSign = extractFloatx80Sign( a );
5011     bSign = extractFloatx80Sign( b );
5012     if ( aSign == bSign ) {
5013         return subFloatx80Sigs(a, b, aSign, status);
5014     }
5015     else {
5016         return addFloatx80Sigs(a, b, aSign, status);
5017     }
5018
5019 }
5020
5021 /*----------------------------------------------------------------------------
5022 | Returns the result of multiplying the extended double-precision floating-
5023 | point values `a' and `b'.  The operation is performed according to the
5024 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5025 *----------------------------------------------------------------------------*/
5026
5027 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5028 {
5029     flag aSign, bSign, zSign;
5030     int32_t aExp, bExp, zExp;
5031     uint64_t aSig, bSig, zSig0, zSig1;
5032
5033     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5034         float_raise(float_flag_invalid, status);
5035         return floatx80_default_nan(status);
5036     }
5037     aSig = extractFloatx80Frac( a );
5038     aExp = extractFloatx80Exp( a );
5039     aSign = extractFloatx80Sign( a );
5040     bSig = extractFloatx80Frac( b );
5041     bExp = extractFloatx80Exp( b );
5042     bSign = extractFloatx80Sign( b );
5043     zSign = aSign ^ bSign;
5044     if ( aExp == 0x7FFF ) {
5045         if (    (uint64_t) ( aSig<<1 )
5046              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5047             return propagateFloatx80NaN(a, b, status);
5048         }
5049         if ( ( bExp | bSig ) == 0 ) goto invalid;
5050         return packFloatx80(zSign, floatx80_infinity_high,
5051                                    floatx80_infinity_low);
5052     }
5053     if ( bExp == 0x7FFF ) {
5054         if ((uint64_t)(bSig << 1)) {
5055             return propagateFloatx80NaN(a, b, status);
5056         }
5057         if ( ( aExp | aSig ) == 0 ) {
5058  invalid:
5059             float_raise(float_flag_invalid, status);
5060             return floatx80_default_nan(status);
5061         }
5062         return packFloatx80(zSign, floatx80_infinity_high,
5063                                    floatx80_infinity_low);
5064     }
5065     if ( aExp == 0 ) {
5066         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5067         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5068     }
5069     if ( bExp == 0 ) {
5070         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5071         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5072     }
5073     zExp = aExp + bExp - 0x3FFE;
5074     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5075     if ( 0 < (int64_t) zSig0 ) {
5076         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5077         --zExp;
5078     }
5079     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5080                                 zSign, zExp, zSig0, zSig1, status);
5081 }
5082
5083 /*----------------------------------------------------------------------------
5084 | Returns the result of dividing the extended double-precision floating-point
5085 | value `a' by the corresponding value `b'.  The operation is performed
5086 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5087 *----------------------------------------------------------------------------*/
5088
5089 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5090 {
5091     flag aSign, bSign, zSign;
5092     int32_t aExp, bExp, zExp;
5093     uint64_t aSig, bSig, zSig0, zSig1;
5094     uint64_t rem0, rem1, rem2, term0, term1, term2;
5095
5096     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5097         float_raise(float_flag_invalid, status);
5098         return floatx80_default_nan(status);
5099     }
5100     aSig = extractFloatx80Frac( a );
5101     aExp = extractFloatx80Exp( a );
5102     aSign = extractFloatx80Sign( a );
5103     bSig = extractFloatx80Frac( b );
5104     bExp = extractFloatx80Exp( b );
5105     bSign = extractFloatx80Sign( b );
5106     zSign = aSign ^ bSign;
5107     if ( aExp == 0x7FFF ) {
5108         if ((uint64_t)(aSig << 1)) {
5109             return propagateFloatx80NaN(a, b, status);
5110         }
5111         if ( bExp == 0x7FFF ) {
5112             if ((uint64_t)(bSig << 1)) {
5113                 return propagateFloatx80NaN(a, b, status);
5114             }
5115             goto invalid;
5116         }
5117         return packFloatx80(zSign, floatx80_infinity_high,
5118                                    floatx80_infinity_low);
5119     }
5120     if ( bExp == 0x7FFF ) {
5121         if ((uint64_t)(bSig << 1)) {
5122             return propagateFloatx80NaN(a, b, status);
5123         }
5124         return packFloatx80( zSign, 0, 0 );
5125     }
5126     if ( bExp == 0 ) {
5127         if ( bSig == 0 ) {
5128             if ( ( aExp | aSig ) == 0 ) {
5129  invalid:
5130                 float_raise(float_flag_invalid, status);
5131                 return floatx80_default_nan(status);
5132             }
5133             float_raise(float_flag_divbyzero, status);
5134             return packFloatx80(zSign, floatx80_infinity_high,
5135                                        floatx80_infinity_low);
5136         }
5137         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5138     }
5139     if ( aExp == 0 ) {
5140         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5141         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5142     }
5143     zExp = aExp - bExp + 0x3FFE;
5144     rem1 = 0;
5145     if ( bSig <= aSig ) {
5146         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5147         ++zExp;
5148     }
5149     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5150     mul64To128( bSig, zSig0, &term0, &term1 );
5151     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5152     while ( (int64_t) rem0 < 0 ) {
5153         --zSig0;
5154         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5155     }
5156     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5157     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5158         mul64To128( bSig, zSig1, &term1, &term2 );
5159         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5160         while ( (int64_t) rem1 < 0 ) {
5161             --zSig1;
5162             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5163         }
5164         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5165     }
5166     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5167                                 zSign, zExp, zSig0, zSig1, status);
5168 }
5169
5170 /*----------------------------------------------------------------------------
5171 | Returns the remainder of the extended double-precision floating-point value
5172 | `a' with respect to the corresponding value `b'.  The operation is performed
5173 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5174 *----------------------------------------------------------------------------*/
5175
5176 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5177 {
5178     flag aSign, zSign;
5179     int32_t aExp, bExp, expDiff;
5180     uint64_t aSig0, aSig1, bSig;
5181     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5182
5183     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5184         float_raise(float_flag_invalid, status);
5185         return floatx80_default_nan(status);
5186     }
5187     aSig0 = extractFloatx80Frac( a );
5188     aExp = extractFloatx80Exp( a );
5189     aSign = extractFloatx80Sign( a );
5190     bSig = extractFloatx80Frac( b );
5191     bExp = extractFloatx80Exp( b );
5192     if ( aExp == 0x7FFF ) {
5193         if (    (uint64_t) ( aSig0<<1 )
5194              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5195             return propagateFloatx80NaN(a, b, status);
5196         }
5197         goto invalid;
5198     }
5199     if ( bExp == 0x7FFF ) {
5200         if ((uint64_t)(bSig << 1)) {
5201             return propagateFloatx80NaN(a, b, status);
5202         }
5203         return a;
5204     }
5205     if ( bExp == 0 ) {
5206         if ( bSig == 0 ) {
5207  invalid:
5208             float_raise(float_flag_invalid, status);
5209             return floatx80_default_nan(status);
5210         }
5211         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5212     }
5213     if ( aExp == 0 ) {
5214         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5215         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5216     }
5217     bSig |= LIT64( 0x8000000000000000 );
5218     zSign = aSign;
5219     expDiff = aExp - bExp;
5220     aSig1 = 0;
5221     if ( expDiff < 0 ) {
5222         if ( expDiff < -1 ) return a;
5223         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5224         expDiff = 0;
5225     }
5226     q = ( bSig <= aSig0 );
5227     if ( q ) aSig0 -= bSig;
5228     expDiff -= 64;
5229     while ( 0 < expDiff ) {
5230         q = estimateDiv128To64( aSig0, aSig1, bSig );
5231         q = ( 2 < q ) ? q - 2 : 0;
5232         mul64To128( bSig, q, &term0, &term1 );
5233         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5234         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5235         expDiff -= 62;
5236     }
5237     expDiff += 64;
5238     if ( 0 < expDiff ) {
5239         q = estimateDiv128To64( aSig0, aSig1, bSig );
5240         q = ( 2 < q ) ? q - 2 : 0;
5241         q >>= 64 - expDiff;
5242         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5243         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5244         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5245         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5246             ++q;
5247             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5248         }
5249     }
5250     else {
5251         term1 = 0;
5252         term0 = bSig;
5253     }
5254     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5255     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5256          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5257               && ( q & 1 ) )
5258        ) {
5259         aSig0 = alternateASig0;
5260         aSig1 = alternateASig1;
5261         zSign = ! zSign;
5262     }
5263     return
5264         normalizeRoundAndPackFloatx80(
5265             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5266
5267 }
5268
5269 /*----------------------------------------------------------------------------
5270 | Returns the square root of the extended double-precision floating-point
5271 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5272 | for Binary Floating-Point Arithmetic.
5273 *----------------------------------------------------------------------------*/
5274
5275 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5276 {
5277     flag aSign;
5278     int32_t aExp, zExp;
5279     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5280     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5281
5282     if (floatx80_invalid_encoding(a)) {
5283         float_raise(float_flag_invalid, status);
5284         return floatx80_default_nan(status);
5285     }
5286     aSig0 = extractFloatx80Frac( a );
5287     aExp = extractFloatx80Exp( a );
5288     aSign = extractFloatx80Sign( a );
5289     if ( aExp == 0x7FFF ) {
5290         if ((uint64_t)(aSig0 << 1)) {
5291             return propagateFloatx80NaN(a, a, status);
5292         }
5293         if ( ! aSign ) return a;
5294         goto invalid;
5295     }
5296     if ( aSign ) {
5297         if ( ( aExp | aSig0 ) == 0 ) return a;
5298  invalid:
5299         float_raise(float_flag_invalid, status);
5300         return floatx80_default_nan(status);
5301     }
5302     if ( aExp == 0 ) {
5303         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5304         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5305     }
5306     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5307     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5308     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5309     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5310     doubleZSig0 = zSig0<<1;
5311     mul64To128( zSig0, zSig0, &term0, &term1 );
5312     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5313     while ( (int64_t) rem0 < 0 ) {
5314         --zSig0;
5315         doubleZSig0 -= 2;
5316         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5317     }
5318     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5319     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5320         if ( zSig1 == 0 ) zSig1 = 1;
5321         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5322         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5323         mul64To128( zSig1, zSig1, &term2, &term3 );
5324         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5325         while ( (int64_t) rem1 < 0 ) {
5326             --zSig1;
5327             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5328             term3 |= 1;
5329             term2 |= doubleZSig0;
5330             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5331         }
5332         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5333     }
5334     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5335     zSig0 |= doubleZSig0;
5336     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5337                                 0, zExp, zSig0, zSig1, status);
5338 }
5339
5340 /*----------------------------------------------------------------------------
5341 | Returns 1 if the extended double-precision floating-point value `a' is equal
5342 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5343 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5345 *----------------------------------------------------------------------------*/
5346
5347 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5348 {
5349
5350     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5351         || (extractFloatx80Exp(a) == 0x7FFF
5352             && (uint64_t) (extractFloatx80Frac(a) << 1))
5353         || (extractFloatx80Exp(b) == 0x7FFF
5354             && (uint64_t) (extractFloatx80Frac(b) << 1))
5355        ) {
5356         float_raise(float_flag_invalid, status);
5357         return 0;
5358     }
5359     return
5360            ( a.low == b.low )
5361         && (    ( a.high == b.high )
5362              || (    ( a.low == 0 )
5363                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5364            );
5365
5366 }
5367
5368 /*----------------------------------------------------------------------------
5369 | Returns 1 if the extended double-precision floating-point value `a' is
5370 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5371 | invalid exception is raised if either operand is a NaN.  The comparison is
5372 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5373 | Arithmetic.
5374 *----------------------------------------------------------------------------*/
5375
5376 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5377 {
5378     flag aSign, bSign;
5379
5380     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5381         || (extractFloatx80Exp(a) == 0x7FFF
5382             && (uint64_t) (extractFloatx80Frac(a) << 1))
5383         || (extractFloatx80Exp(b) == 0x7FFF
5384             && (uint64_t) (extractFloatx80Frac(b) << 1))
5385        ) {
5386         float_raise(float_flag_invalid, status);
5387         return 0;
5388     }
5389     aSign = extractFloatx80Sign( a );
5390     bSign = extractFloatx80Sign( b );
5391     if ( aSign != bSign ) {
5392         return
5393                aSign
5394             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5395                  == 0 );
5396     }
5397     return
5398           aSign ? le128( b.high, b.low, a.high, a.low )
5399         : le128( a.high, a.low, b.high, b.low );
5400
5401 }
5402
5403 /*----------------------------------------------------------------------------
5404 | Returns 1 if the extended double-precision floating-point value `a' is
5405 | less than the corresponding value `b', and 0 otherwise.  The invalid
5406 | exception is raised if either operand is a NaN.  The comparison is performed
5407 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5409
5410 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5411 {
5412     flag aSign, bSign;
5413
5414     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5415         || (extractFloatx80Exp(a) == 0x7FFF
5416             && (uint64_t) (extractFloatx80Frac(a) << 1))
5417         || (extractFloatx80Exp(b) == 0x7FFF
5418             && (uint64_t) (extractFloatx80Frac(b) << 1))
5419        ) {
5420         float_raise(float_flag_invalid, status);
5421         return 0;
5422     }
5423     aSign = extractFloatx80Sign( a );
5424     bSign = extractFloatx80Sign( b );
5425     if ( aSign != bSign ) {
5426         return
5427                aSign
5428             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5429                  != 0 );
5430     }
5431     return
5432           aSign ? lt128( b.high, b.low, a.high, a.low )
5433         : lt128( a.high, a.low, b.high, b.low );
5434
5435 }
5436
5437 /*----------------------------------------------------------------------------
5438 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5439 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5440 | either operand is a NaN.   The comparison is performed according to the
5441 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5442 *----------------------------------------------------------------------------*/
5443 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5444 {
5445     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5446         || (extractFloatx80Exp(a) == 0x7FFF
5447             && (uint64_t) (extractFloatx80Frac(a) << 1))
5448         || (extractFloatx80Exp(b) == 0x7FFF
5449             && (uint64_t) (extractFloatx80Frac(b) << 1))
5450        ) {
5451         float_raise(float_flag_invalid, status);
5452         return 1;
5453     }
5454     return 0;
5455 }
5456
5457 /*----------------------------------------------------------------------------
5458 | Returns 1 if the extended double-precision floating-point value `a' is
5459 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5460 | cause an exception.  The comparison is performed according to the IEC/IEEE
5461 | Standard for Binary Floating-Point Arithmetic.
5462 *----------------------------------------------------------------------------*/
5463
5464 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5465 {
5466
5467     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5468         float_raise(float_flag_invalid, status);
5469         return 0;
5470     }
5471     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5472               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5473          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5474               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5475        ) {
5476         if (floatx80_is_signaling_nan(a, status)
5477          || floatx80_is_signaling_nan(b, status)) {
5478             float_raise(float_flag_invalid, status);
5479         }
5480         return 0;
5481     }
5482     return
5483            ( a.low == b.low )
5484         && (    ( a.high == b.high )
5485              || (    ( a.low == 0 )
5486                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5487            );
5488
5489 }
5490
5491 /*----------------------------------------------------------------------------
5492 | Returns 1 if the extended double-precision floating-point value `a' is less
5493 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5494 | do not cause an exception.  Otherwise, the comparison is performed according
5495 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5496 *----------------------------------------------------------------------------*/
5497
5498 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5499 {
5500     flag aSign, bSign;
5501
5502     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5503         float_raise(float_flag_invalid, status);
5504         return 0;
5505     }
5506     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5507               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5508          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5509               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5510        ) {
5511         if (floatx80_is_signaling_nan(a, status)
5512          || floatx80_is_signaling_nan(b, status)) {
5513             float_raise(float_flag_invalid, status);
5514         }
5515         return 0;
5516     }
5517     aSign = extractFloatx80Sign( a );
5518     bSign = extractFloatx80Sign( b );
5519     if ( aSign != bSign ) {
5520         return
5521                aSign
5522             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5523                  == 0 );
5524     }
5525     return
5526           aSign ? le128( b.high, b.low, a.high, a.low )
5527         : le128( a.high, a.low, b.high, b.low );
5528
5529 }
5530
5531 /*----------------------------------------------------------------------------
5532 | Returns 1 if the extended double-precision floating-point value `a' is less
5533 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5534 | an exception.  Otherwise, the comparison is performed according to the
5535 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5536 *----------------------------------------------------------------------------*/
5537
5538 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5539 {
5540     flag aSign, bSign;
5541
5542     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5543         float_raise(float_flag_invalid, status);
5544         return 0;
5545     }
5546     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5547               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5548          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5549               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5550        ) {
5551         if (floatx80_is_signaling_nan(a, status)
5552          || floatx80_is_signaling_nan(b, status)) {
5553             float_raise(float_flag_invalid, status);
5554         }
5555         return 0;
5556     }
5557     aSign = extractFloatx80Sign( a );
5558     bSign = extractFloatx80Sign( b );
5559     if ( aSign != bSign ) {
5560         return
5561                aSign
5562             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5563                  != 0 );
5564     }
5565     return
5566           aSign ? lt128( b.high, b.low, a.high, a.low )
5567         : lt128( a.high, a.low, b.high, b.low );
5568
5569 }
5570
5571 /*----------------------------------------------------------------------------
5572 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5573 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5574 | The comparison is performed according to the IEC/IEEE Standard for Binary
5575 | Floating-Point Arithmetic.
5576 *----------------------------------------------------------------------------*/
5577 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5578 {
5579     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5580         float_raise(float_flag_invalid, status);
5581         return 1;
5582     }
5583     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5584               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5585          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5586               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5587        ) {
5588         if (floatx80_is_signaling_nan(a, status)
5589          || floatx80_is_signaling_nan(b, status)) {
5590             float_raise(float_flag_invalid, status);
5591         }
5592         return 1;
5593     }
5594     return 0;
5595 }
5596
5597 /*----------------------------------------------------------------------------
5598 | Returns the result of converting the quadruple-precision floating-point
5599 | value `a' to the 32-bit two's complement integer format.  The conversion
5600 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5601 | Arithmetic---which means in particular that the conversion is rounded
5602 | according to the current rounding mode.  If `a' is a NaN, the largest
5603 | positive integer is returned.  Otherwise, if the conversion overflows, the
5604 | largest integer with the same sign as `a' is returned.
5605 *----------------------------------------------------------------------------*/
5606
5607 int32_t float128_to_int32(float128 a, float_status *status)
5608 {
5609     flag aSign;
5610     int32_t aExp, shiftCount;
5611     uint64_t aSig0, aSig1;
5612
5613     aSig1 = extractFloat128Frac1( a );
5614     aSig0 = extractFloat128Frac0( a );
5615     aExp = extractFloat128Exp( a );
5616     aSign = extractFloat128Sign( a );
5617     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5618     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5619     aSig0 |= ( aSig1 != 0 );
5620     shiftCount = 0x4028 - aExp;
5621     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5622     return roundAndPackInt32(aSign, aSig0, status);
5623
5624 }
5625
5626 /*----------------------------------------------------------------------------
5627 | Returns the result of converting the quadruple-precision floating-point
5628 | value `a' to the 32-bit two's complement integer format.  The conversion
5629 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5630 | Arithmetic, except that the conversion is always rounded toward zero.  If
5631 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5632 | conversion overflows, the largest integer with the same sign as `a' is
5633 | returned.
5634 *----------------------------------------------------------------------------*/
5635
5636 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5637 {
5638     flag aSign;
5639     int32_t aExp, shiftCount;
5640     uint64_t aSig0, aSig1, savedASig;
5641     int32_t z;
5642
5643     aSig1 = extractFloat128Frac1( a );
5644     aSig0 = extractFloat128Frac0( a );
5645     aExp = extractFloat128Exp( a );
5646     aSign = extractFloat128Sign( a );
5647     aSig0 |= ( aSig1 != 0 );
5648     if ( 0x401E < aExp ) {
5649         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5650         goto invalid;
5651     }
5652     else if ( aExp < 0x3FFF ) {
5653         if (aExp || aSig0) {
5654             status->float_exception_flags |= float_flag_inexact;
5655         }
5656         return 0;
5657     }
5658     aSig0 |= LIT64( 0x0001000000000000 );
5659     shiftCount = 0x402F - aExp;
5660     savedASig = aSig0;
5661     aSig0 >>= shiftCount;
5662     z = aSig0;
5663     if ( aSign ) z = - z;
5664     if ( ( z < 0 ) ^ aSign ) {
5665  invalid:
5666         float_raise(float_flag_invalid, status);
5667         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5668     }
5669     if ( ( aSig0<<shiftCount ) != savedASig ) {
5670         status->float_exception_flags |= float_flag_inexact;
5671     }
5672     return z;
5673
5674 }
5675
5676 /*----------------------------------------------------------------------------
5677 | Returns the result of converting the quadruple-precision floating-point
5678 | value `a' to the 64-bit two's complement integer format.  The conversion
5679 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5680 | Arithmetic---which means in particular that the conversion is rounded
5681 | according to the current rounding mode.  If `a' is a NaN, the largest
5682 | positive integer is returned.  Otherwise, if the conversion overflows, the
5683 | largest integer with the same sign as `a' is returned.
5684 *----------------------------------------------------------------------------*/
5685
5686 int64_t float128_to_int64(float128 a, float_status *status)
5687 {
5688     flag aSign;
5689     int32_t aExp, shiftCount;
5690     uint64_t aSig0, aSig1;
5691
5692     aSig1 = extractFloat128Frac1( a );
5693     aSig0 = extractFloat128Frac0( a );
5694     aExp = extractFloat128Exp( a );
5695     aSign = extractFloat128Sign( a );
5696     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5697     shiftCount = 0x402F - aExp;
5698     if ( shiftCount <= 0 ) {
5699         if ( 0x403E < aExp ) {
5700             float_raise(float_flag_invalid, status);
5701             if (    ! aSign
5702                  || (    ( aExp == 0x7FFF )
5703                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5704                     )
5705                ) {
5706                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5707             }
5708             return (int64_t) LIT64( 0x8000000000000000 );
5709         }
5710         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5711     }
5712     else {
5713         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5714     }
5715     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5716
5717 }
5718
5719 /*----------------------------------------------------------------------------
5720 | Returns the result of converting the quadruple-precision floating-point
5721 | value `a' to the 64-bit two's complement integer format.  The conversion
5722 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5723 | Arithmetic, except that the conversion is always rounded toward zero.
5724 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5725 | the conversion overflows, the largest integer with the same sign as `a' is
5726 | returned.
5727 *----------------------------------------------------------------------------*/
5728
5729 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5730 {
5731     flag aSign;
5732     int32_t aExp, shiftCount;
5733     uint64_t aSig0, aSig1;
5734     int64_t z;
5735
5736     aSig1 = extractFloat128Frac1( a );
5737     aSig0 = extractFloat128Frac0( a );
5738     aExp = extractFloat128Exp( a );
5739     aSign = extractFloat128Sign( a );
5740     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5741     shiftCount = aExp - 0x402F;
5742     if ( 0 < shiftCount ) {
5743         if ( 0x403E <= aExp ) {
5744             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5745             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5746                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5747                 if (aSig1) {
5748                     status->float_exception_flags |= float_flag_inexact;
5749                 }
5750             }
5751             else {
5752                 float_raise(float_flag_invalid, status);
5753                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5754                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5755                 }
5756             }
5757             return (int64_t) LIT64( 0x8000000000000000 );
5758         }
5759         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5760         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5761             status->float_exception_flags |= float_flag_inexact;
5762         }
5763     }
5764     else {
5765         if ( aExp < 0x3FFF ) {
5766             if ( aExp | aSig0 | aSig1 ) {
5767                 status->float_exception_flags |= float_flag_inexact;
5768             }
5769             return 0;
5770         }
5771         z = aSig0>>( - shiftCount );
5772         if (    aSig1
5773              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5774             status->float_exception_flags |= float_flag_inexact;
5775         }
5776     }
5777     if ( aSign ) z = - z;
5778     return z;
5779
5780 }
5781
5782 /*----------------------------------------------------------------------------
5783 | Returns the result of converting the quadruple-precision floating-point value
5784 | `a' to the 64-bit unsigned integer format.  The conversion is
5785 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5786 | Arithmetic---which means in particular that the conversion is rounded
5787 | according to the current rounding mode.  If `a' is a NaN, the largest
5788 | positive integer is returned.  If the conversion overflows, the
5789 | largest unsigned integer is returned.  If 'a' is negative, the value is
5790 | rounded and zero is returned; negative values that do not round to zero
5791 | will raise the inexact exception.
5792 *----------------------------------------------------------------------------*/
5793
5794 uint64_t float128_to_uint64(float128 a, float_status *status)
5795 {
5796     flag aSign;
5797     int aExp;
5798     int shiftCount;
5799     uint64_t aSig0, aSig1;
5800
5801     aSig0 = extractFloat128Frac0(a);
5802     aSig1 = extractFloat128Frac1(a);
5803     aExp = extractFloat128Exp(a);
5804     aSign = extractFloat128Sign(a);
5805     if (aSign && (aExp > 0x3FFE)) {
5806         float_raise(float_flag_invalid, status);
5807         if (float128_is_any_nan(a)) {
5808             return LIT64(0xFFFFFFFFFFFFFFFF);
5809         } else {
5810             return 0;
5811         }
5812     }
5813     if (aExp) {
5814         aSig0 |= LIT64(0x0001000000000000);
5815     }
5816     shiftCount = 0x402F - aExp;
5817     if (shiftCount <= 0) {
5818         if (0x403E < aExp) {
5819             float_raise(float_flag_invalid, status);
5820             return LIT64(0xFFFFFFFFFFFFFFFF);
5821         }
5822         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5823     } else {
5824         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5825     }
5826     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5827 }
5828
5829 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5830 {
5831     uint64_t v;
5832     signed char current_rounding_mode = status->float_rounding_mode;
5833
5834     set_float_rounding_mode(float_round_to_zero, status);
5835     v = float128_to_uint64(a, status);
5836     set_float_rounding_mode(current_rounding_mode, status);
5837
5838     return v;
5839 }
5840
5841 /*----------------------------------------------------------------------------
5842 | Returns the result of converting the quadruple-precision floating-point
5843 | value `a' to the 32-bit unsigned integer format.  The conversion
5844 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5845 | Arithmetic except that the conversion is always rounded toward zero.
5846 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5847 | if the conversion overflows, the largest unsigned integer is returned.
5848 | If 'a' is negative, the value is rounded and zero is returned; negative
5849 | values that do not round to zero will raise the inexact exception.
5850 *----------------------------------------------------------------------------*/
5851
5852 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5853 {
5854     uint64_t v;
5855     uint32_t res;
5856     int old_exc_flags = get_float_exception_flags(status);
5857
5858     v = float128_to_uint64_round_to_zero(a, status);
5859     if (v > 0xffffffff) {
5860         res = 0xffffffff;
5861     } else {
5862         return v;
5863     }
5864     set_float_exception_flags(old_exc_flags, status);
5865     float_raise(float_flag_invalid, status);
5866     return res;
5867 }
5868
5869 /*----------------------------------------------------------------------------
5870 | Returns the result of converting the quadruple-precision floating-point
5871 | value `a' to the single-precision floating-point format.  The conversion
5872 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5873 | Arithmetic.
5874 *----------------------------------------------------------------------------*/
5875
5876 float32 float128_to_float32(float128 a, float_status *status)
5877 {
5878     flag aSign;
5879     int32_t aExp;
5880     uint64_t aSig0, aSig1;
5881     uint32_t zSig;
5882
5883     aSig1 = extractFloat128Frac1( a );
5884     aSig0 = extractFloat128Frac0( a );
5885     aExp = extractFloat128Exp( a );
5886     aSign = extractFloat128Sign( a );
5887     if ( aExp == 0x7FFF ) {
5888         if ( aSig0 | aSig1 ) {
5889             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5890         }
5891         return packFloat32( aSign, 0xFF, 0 );
5892     }
5893     aSig0 |= ( aSig1 != 0 );
5894     shift64RightJamming( aSig0, 18, &aSig0 );
5895     zSig = aSig0;
5896     if ( aExp || zSig ) {
5897         zSig |= 0x40000000;
5898         aExp -= 0x3F81;
5899     }
5900     return roundAndPackFloat32(aSign, aExp, zSig, status);
5901
5902 }
5903
5904 /*----------------------------------------------------------------------------
5905 | Returns the result of converting the quadruple-precision floating-point
5906 | value `a' to the double-precision floating-point format.  The conversion
5907 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5908 | Arithmetic.
5909 *----------------------------------------------------------------------------*/
5910
5911 float64 float128_to_float64(float128 a, float_status *status)
5912 {
5913     flag aSign;
5914     int32_t aExp;
5915     uint64_t aSig0, aSig1;
5916
5917     aSig1 = extractFloat128Frac1( a );
5918     aSig0 = extractFloat128Frac0( a );
5919     aExp = extractFloat128Exp( a );
5920     aSign = extractFloat128Sign( a );
5921     if ( aExp == 0x7FFF ) {
5922         if ( aSig0 | aSig1 ) {
5923             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5924         }
5925         return packFloat64( aSign, 0x7FF, 0 );
5926     }
5927     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5928     aSig0 |= ( aSig1 != 0 );
5929     if ( aExp || aSig0 ) {
5930         aSig0 |= LIT64( 0x4000000000000000 );
5931         aExp -= 0x3C01;
5932     }
5933     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5934
5935 }
5936
5937 /*----------------------------------------------------------------------------
5938 | Returns the result of converting the quadruple-precision floating-point
5939 | value `a' to the extended double-precision floating-point format.  The
5940 | conversion is performed according to the IEC/IEEE Standard for Binary
5941 | Floating-Point Arithmetic.
5942 *----------------------------------------------------------------------------*/
5943
5944 floatx80 float128_to_floatx80(float128 a, float_status *status)
5945 {
5946     flag aSign;
5947     int32_t aExp;
5948     uint64_t aSig0, aSig1;
5949
5950     aSig1 = extractFloat128Frac1( a );
5951     aSig0 = extractFloat128Frac0( a );
5952     aExp = extractFloat128Exp( a );
5953     aSign = extractFloat128Sign( a );
5954     if ( aExp == 0x7FFF ) {
5955         if ( aSig0 | aSig1 ) {
5956             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5957         }
5958         return packFloatx80(aSign, floatx80_infinity_high,
5959                                    floatx80_infinity_low);
5960     }
5961     if ( aExp == 0 ) {
5962         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5963         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5964     }
5965     else {
5966         aSig0 |= LIT64( 0x0001000000000000 );
5967     }
5968     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5969     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5970
5971 }
5972
5973 /*----------------------------------------------------------------------------
5974 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5975 | returns the result as a quadruple-precision floating-point value.  The
5976 | operation is performed according to the IEC/IEEE Standard for Binary
5977 | Floating-Point Arithmetic.
5978 *----------------------------------------------------------------------------*/
5979
5980 float128 float128_round_to_int(float128 a, float_status *status)
5981 {
5982     flag aSign;
5983     int32_t aExp;
5984     uint64_t lastBitMask, roundBitsMask;
5985     float128 z;
5986
5987     aExp = extractFloat128Exp( a );
5988     if ( 0x402F <= aExp ) {
5989         if ( 0x406F <= aExp ) {
5990             if (    ( aExp == 0x7FFF )
5991                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5992                ) {
5993                 return propagateFloat128NaN(a, a, status);
5994             }
5995             return a;
5996         }
5997         lastBitMask = 1;
5998         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5999         roundBitsMask = lastBitMask - 1;
6000         z = a;
6001         switch (status->float_rounding_mode) {
6002         case float_round_nearest_even:
6003             if ( lastBitMask ) {
6004                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6005                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6006             }
6007             else {
6008                 if ( (int64_t) z.low < 0 ) {
6009                     ++z.high;
6010                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6011                 }
6012             }
6013             break;
6014         case float_round_ties_away:
6015             if (lastBitMask) {
6016                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6017             } else {
6018                 if ((int64_t) z.low < 0) {
6019                     ++z.high;
6020                 }
6021             }
6022             break;
6023         case float_round_to_zero:
6024             break;
6025         case float_round_up:
6026             if (!extractFloat128Sign(z)) {
6027                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6028             }
6029             break;
6030         case float_round_down:
6031             if (extractFloat128Sign(z)) {
6032                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6033             }
6034             break;
6035         default:
6036             abort();
6037         }
6038         z.low &= ~ roundBitsMask;
6039     }
6040     else {
6041         if ( aExp < 0x3FFF ) {
6042             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6043             status->float_exception_flags |= float_flag_inexact;
6044             aSign = extractFloat128Sign( a );
6045             switch (status->float_rounding_mode) {
6046              case float_round_nearest_even:
6047                 if (    ( aExp == 0x3FFE )
6048                      && (   extractFloat128Frac0( a )
6049                           | extractFloat128Frac1( a ) )
6050                    ) {
6051                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6052                 }
6053                 break;
6054             case float_round_ties_away:
6055                 if (aExp == 0x3FFE) {
6056                     return packFloat128(aSign, 0x3FFF, 0, 0);
6057                 }
6058                 break;
6059              case float_round_down:
6060                 return
6061                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6062                     : packFloat128( 0, 0, 0, 0 );
6063              case float_round_up:
6064                 return
6065                       aSign ? packFloat128( 1, 0, 0, 0 )
6066                     : packFloat128( 0, 0x3FFF, 0, 0 );
6067             }
6068             return packFloat128( aSign, 0, 0, 0 );
6069         }
6070         lastBitMask = 1;
6071         lastBitMask <<= 0x402F - aExp;
6072         roundBitsMask = lastBitMask - 1;
6073         z.low = 0;
6074         z.high = a.high;
6075         switch (status->float_rounding_mode) {
6076         case float_round_nearest_even:
6077             z.high += lastBitMask>>1;
6078             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6079                 z.high &= ~ lastBitMask;
6080             }
6081             break;
6082         case float_round_ties_away:
6083             z.high += lastBitMask>>1;
6084             break;
6085         case float_round_to_zero:
6086             break;
6087         case float_round_up:
6088             if (!extractFloat128Sign(z)) {
6089                 z.high |= ( a.low != 0 );
6090                 z.high += roundBitsMask;
6091             }
6092             break;
6093         case float_round_down:
6094             if (extractFloat128Sign(z)) {
6095                 z.high |= (a.low != 0);
6096                 z.high += roundBitsMask;
6097             }
6098             break;
6099         default:
6100             abort();
6101         }
6102         z.high &= ~ roundBitsMask;
6103     }
6104     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6105         status->float_exception_flags |= float_flag_inexact;
6106     }
6107     return z;
6108
6109 }
6110
6111 /*----------------------------------------------------------------------------
6112 | Returns the result of adding the absolute values of the quadruple-precision
6113 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6114 | before being returned.  `zSign' is ignored if the result is a NaN.
6115 | The addition is performed according to the IEC/IEEE Standard for Binary
6116 | Floating-Point Arithmetic.
6117 *----------------------------------------------------------------------------*/
6118
6119 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6120                                 float_status *status)
6121 {
6122     int32_t aExp, bExp, zExp;
6123     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6124     int32_t expDiff;
6125
6126     aSig1 = extractFloat128Frac1( a );
6127     aSig0 = extractFloat128Frac0( a );
6128     aExp = extractFloat128Exp( a );
6129     bSig1 = extractFloat128Frac1( b );
6130     bSig0 = extractFloat128Frac0( b );
6131     bExp = extractFloat128Exp( b );
6132     expDiff = aExp - bExp;
6133     if ( 0 < expDiff ) {
6134         if ( aExp == 0x7FFF ) {
6135             if (aSig0 | aSig1) {
6136                 return propagateFloat128NaN(a, b, status);
6137             }
6138             return a;
6139         }
6140         if ( bExp == 0 ) {
6141             --expDiff;
6142         }
6143         else {
6144             bSig0 |= LIT64( 0x0001000000000000 );
6145         }
6146         shift128ExtraRightJamming(
6147             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6148         zExp = aExp;
6149     }
6150     else if ( expDiff < 0 ) {
6151         if ( bExp == 0x7FFF ) {
6152             if (bSig0 | bSig1) {
6153                 return propagateFloat128NaN(a, b, status);
6154             }
6155             return packFloat128( zSign, 0x7FFF, 0, 0 );
6156         }
6157         if ( aExp == 0 ) {
6158             ++expDiff;
6159         }
6160         else {
6161             aSig0 |= LIT64( 0x0001000000000000 );
6162         }
6163         shift128ExtraRightJamming(
6164             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6165         zExp = bExp;
6166     }
6167     else {
6168         if ( aExp == 0x7FFF ) {
6169             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6170                 return propagateFloat128NaN(a, b, status);
6171             }
6172             return a;
6173         }
6174         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6175         if ( aExp == 0 ) {
6176             if (status->flush_to_zero) {
6177                 if (zSig0 | zSig1) {
6178                     float_raise(float_flag_output_denormal, status);
6179                 }
6180                 return packFloat128(zSign, 0, 0, 0);
6181             }
6182             return packFloat128( zSign, 0, zSig0, zSig1 );
6183         }
6184         zSig2 = 0;
6185         zSig0 |= LIT64( 0x0002000000000000 );
6186         zExp = aExp;
6187         goto shiftRight1;
6188     }
6189     aSig0 |= LIT64( 0x0001000000000000 );
6190     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6191     --zExp;
6192     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6193     ++zExp;
6194  shiftRight1:
6195     shift128ExtraRightJamming(
6196         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6197  roundAndPack:
6198     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6199
6200 }
6201
6202 /*----------------------------------------------------------------------------
6203 | Returns the result of subtracting the absolute values of the quadruple-
6204 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6205 | difference is negated before being returned.  `zSign' is ignored if the
6206 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6207 | Standard for Binary Floating-Point Arithmetic.
6208 *----------------------------------------------------------------------------*/
6209
6210 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6211                                 float_status *status)
6212 {
6213     int32_t aExp, bExp, zExp;
6214     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6215     int32_t expDiff;
6216
6217     aSig1 = extractFloat128Frac1( a );
6218     aSig0 = extractFloat128Frac0( a );
6219     aExp = extractFloat128Exp( a );
6220     bSig1 = extractFloat128Frac1( b );
6221     bSig0 = extractFloat128Frac0( b );
6222     bExp = extractFloat128Exp( b );
6223     expDiff = aExp - bExp;
6224     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6225     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6226     if ( 0 < expDiff ) goto aExpBigger;
6227     if ( expDiff < 0 ) goto bExpBigger;
6228     if ( aExp == 0x7FFF ) {
6229         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6230             return propagateFloat128NaN(a, b, status);
6231         }
6232         float_raise(float_flag_invalid, status);
6233         return float128_default_nan(status);
6234     }
6235     if ( aExp == 0 ) {
6236         aExp = 1;
6237         bExp = 1;
6238     }
6239     if ( bSig0 < aSig0 ) goto aBigger;
6240     if ( aSig0 < bSig0 ) goto bBigger;
6241     if ( bSig1 < aSig1 ) goto aBigger;
6242     if ( aSig1 < bSig1 ) goto bBigger;
6243     return packFloat128(status->float_rounding_mode == float_round_down,
6244                         0, 0, 0);
6245  bExpBigger:
6246     if ( bExp == 0x7FFF ) {
6247         if (bSig0 | bSig1) {
6248             return propagateFloat128NaN(a, b, status);
6249         }
6250         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6251     }
6252     if ( aExp == 0 ) {
6253         ++expDiff;
6254     }
6255     else {
6256         aSig0 |= LIT64( 0x4000000000000000 );
6257     }
6258     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6259     bSig0 |= LIT64( 0x4000000000000000 );
6260  bBigger:
6261     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6262     zExp = bExp;
6263     zSign ^= 1;
6264     goto normalizeRoundAndPack;
6265  aExpBigger:
6266     if ( aExp == 0x7FFF ) {
6267         if (aSig0 | aSig1) {
6268             return propagateFloat128NaN(a, b, status);
6269         }
6270         return a;
6271     }
6272     if ( bExp == 0 ) {
6273         --expDiff;
6274     }
6275     else {
6276         bSig0 |= LIT64( 0x4000000000000000 );
6277     }
6278     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6279     aSig0 |= LIT64( 0x4000000000000000 );
6280  aBigger:
6281     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6282     zExp = aExp;
6283  normalizeRoundAndPack:
6284     --zExp;
6285     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6286                                          status);
6287
6288 }
6289
6290 /*----------------------------------------------------------------------------
6291 | Returns the result of adding the quadruple-precision floating-point values
6292 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6293 | for Binary Floating-Point Arithmetic.
6294 *----------------------------------------------------------------------------*/
6295
6296 float128 float128_add(float128 a, float128 b, float_status *status)
6297 {
6298     flag aSign, bSign;
6299
6300     aSign = extractFloat128Sign( a );
6301     bSign = extractFloat128Sign( b );
6302     if ( aSign == bSign ) {
6303         return addFloat128Sigs(a, b, aSign, status);
6304     }
6305     else {
6306         return subFloat128Sigs(a, b, aSign, status);
6307     }
6308
6309 }
6310
6311 /*----------------------------------------------------------------------------
6312 | Returns the result of subtracting the quadruple-precision floating-point
6313 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6314 | Standard for Binary Floating-Point Arithmetic.
6315 *----------------------------------------------------------------------------*/
6316
6317 float128 float128_sub(float128 a, float128 b, float_status *status)
6318 {
6319     flag aSign, bSign;
6320
6321     aSign = extractFloat128Sign( a );
6322     bSign = extractFloat128Sign( b );
6323     if ( aSign == bSign ) {
6324         return subFloat128Sigs(a, b, aSign, status);
6325     }
6326     else {
6327         return addFloat128Sigs(a, b, aSign, status);
6328     }
6329
6330 }
6331
6332 /*----------------------------------------------------------------------------
6333 | Returns the result of multiplying the quadruple-precision floating-point
6334 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6335 | Standard for Binary Floating-Point Arithmetic.
6336 *----------------------------------------------------------------------------*/
6337
6338 float128 float128_mul(float128 a, float128 b, float_status *status)
6339 {
6340     flag aSign, bSign, zSign;
6341     int32_t aExp, bExp, zExp;
6342     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6343
6344     aSig1 = extractFloat128Frac1( a );
6345     aSig0 = extractFloat128Frac0( a );
6346     aExp = extractFloat128Exp( a );
6347     aSign = extractFloat128Sign( a );
6348     bSig1 = extractFloat128Frac1( b );
6349     bSig0 = extractFloat128Frac0( b );
6350     bExp = extractFloat128Exp( b );
6351     bSign = extractFloat128Sign( b );
6352     zSign = aSign ^ bSign;
6353     if ( aExp == 0x7FFF ) {
6354         if (    ( aSig0 | aSig1 )
6355              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6356             return propagateFloat128NaN(a, b, status);
6357         }
6358         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6359         return packFloat128( zSign, 0x7FFF, 0, 0 );
6360     }
6361     if ( bExp == 0x7FFF ) {
6362         if (bSig0 | bSig1) {
6363             return propagateFloat128NaN(a, b, status);
6364         }
6365         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6366  invalid:
6367             float_raise(float_flag_invalid, status);
6368             return float128_default_nan(status);
6369         }
6370         return packFloat128( zSign, 0x7FFF, 0, 0 );
6371     }
6372     if ( aExp == 0 ) {
6373         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6374         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6375     }
6376     if ( bExp == 0 ) {
6377         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6378         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6379     }
6380     zExp = aExp + bExp - 0x4000;
6381     aSig0 |= LIT64( 0x0001000000000000 );
6382     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6383     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6384     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6385     zSig2 |= ( zSig3 != 0 );
6386     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6387         shift128ExtraRightJamming(
6388             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6389         ++zExp;
6390     }
6391     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6392
6393 }
6394
6395 /*----------------------------------------------------------------------------
6396 | Returns the result of dividing the quadruple-precision floating-point value
6397 | `a' by the corresponding value `b'.  The operation is performed according to
6398 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6399 *----------------------------------------------------------------------------*/
6400
6401 float128 float128_div(float128 a, float128 b, float_status *status)
6402 {
6403     flag aSign, bSign, zSign;
6404     int32_t aExp, bExp, zExp;
6405     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6406     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6407
6408     aSig1 = extractFloat128Frac1( a );
6409     aSig0 = extractFloat128Frac0( a );
6410     aExp = extractFloat128Exp( a );
6411     aSign = extractFloat128Sign( a );
6412     bSig1 = extractFloat128Frac1( b );
6413     bSig0 = extractFloat128Frac0( b );
6414     bExp = extractFloat128Exp( b );
6415     bSign = extractFloat128Sign( b );
6416     zSign = aSign ^ bSign;
6417     if ( aExp == 0x7FFF ) {
6418         if (aSig0 | aSig1) {
6419             return propagateFloat128NaN(a, b, status);
6420         }
6421         if ( bExp == 0x7FFF ) {
6422             if (bSig0 | bSig1) {
6423                 return propagateFloat128NaN(a, b, status);
6424             }
6425             goto invalid;
6426         }
6427         return packFloat128( zSign, 0x7FFF, 0, 0 );
6428     }
6429     if ( bExp == 0x7FFF ) {
6430         if (bSig0 | bSig1) {
6431             return propagateFloat128NaN(a, b, status);
6432         }
6433         return packFloat128( zSign, 0, 0, 0 );
6434     }
6435     if ( bExp == 0 ) {
6436         if ( ( bSig0 | bSig1 ) == 0 ) {
6437             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6438  invalid:
6439                 float_raise(float_flag_invalid, status);
6440                 return float128_default_nan(status);
6441             }
6442             float_raise(float_flag_divbyzero, status);
6443             return packFloat128( zSign, 0x7FFF, 0, 0 );
6444         }
6445         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6446     }
6447     if ( aExp == 0 ) {
6448         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6449         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6450     }
6451     zExp = aExp - bExp + 0x3FFD;
6452     shortShift128Left(
6453         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6454     shortShift128Left(
6455         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6456     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6457         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6458         ++zExp;
6459     }
6460     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6461     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6462     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6463     while ( (int64_t) rem0 < 0 ) {
6464         --zSig0;
6465         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6466     }
6467     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6468     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6469         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6470         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6471         while ( (int64_t) rem1 < 0 ) {
6472             --zSig1;
6473             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6474         }
6475         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6476     }
6477     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6478     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6479
6480 }
6481
6482 /*----------------------------------------------------------------------------
6483 | Returns the remainder of the quadruple-precision floating-point value `a'
6484 | with respect to the corresponding value `b'.  The operation is performed
6485 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6486 *----------------------------------------------------------------------------*/
6487
6488 float128 float128_rem(float128 a, float128 b, float_status *status)
6489 {
6490     flag aSign, zSign;
6491     int32_t aExp, bExp, expDiff;
6492     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6493     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6494     int64_t sigMean0;
6495
6496     aSig1 = extractFloat128Frac1( a );
6497     aSig0 = extractFloat128Frac0( a );
6498     aExp = extractFloat128Exp( a );
6499     aSign = extractFloat128Sign( a );
6500     bSig1 = extractFloat128Frac1( b );
6501     bSig0 = extractFloat128Frac0( b );
6502     bExp = extractFloat128Exp( b );
6503     if ( aExp == 0x7FFF ) {
6504         if (    ( aSig0 | aSig1 )
6505              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6506             return propagateFloat128NaN(a, b, status);
6507         }
6508         goto invalid;
6509     }
6510     if ( bExp == 0x7FFF ) {
6511         if (bSig0 | bSig1) {
6512             return propagateFloat128NaN(a, b, status);
6513         }
6514         return a;
6515     }
6516     if ( bExp == 0 ) {
6517         if ( ( bSig0 | bSig1 ) == 0 ) {
6518  invalid:
6519             float_raise(float_flag_invalid, status);
6520             return float128_default_nan(status);
6521         }
6522         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6523     }
6524     if ( aExp == 0 ) {
6525         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6526         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6527     }
6528     expDiff = aExp - bExp;
6529     if ( expDiff < -1 ) return a;
6530     shortShift128Left(
6531         aSig0 | LIT64( 0x0001000000000000 ),
6532         aSig1,
6533         15 - ( expDiff < 0 ),
6534         &aSig0,
6535         &aSig1
6536     );
6537     shortShift128Left(
6538         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6539     q = le128( bSig0, bSig1, aSig0, aSig1 );
6540     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6541     expDiff -= 64;
6542     while ( 0 < expDiff ) {
6543         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6544         q = ( 4 < q ) ? q - 4 : 0;
6545         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6546         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6547         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6548         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6549         expDiff -= 61;
6550     }
6551     if ( -64 < expDiff ) {
6552         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6553         q = ( 4 < q ) ? q - 4 : 0;
6554         q >>= - expDiff;
6555         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6556         expDiff += 52;
6557         if ( expDiff < 0 ) {
6558             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6559         }
6560         else {
6561             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6562         }
6563         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6564         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6565     }
6566     else {
6567         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6568         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6569     }
6570     do {
6571         alternateASig0 = aSig0;
6572         alternateASig1 = aSig1;
6573         ++q;
6574         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6575     } while ( 0 <= (int64_t) aSig0 );
6576     add128(
6577         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6578     if (    ( sigMean0 < 0 )
6579          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6580         aSig0 = alternateASig0;
6581         aSig1 = alternateASig1;
6582     }
6583     zSign = ( (int64_t) aSig0 < 0 );
6584     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6585     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6586                                          status);
6587 }
6588
6589 /*----------------------------------------------------------------------------
6590 | Returns the square root of the quadruple-precision floating-point value `a'.
6591 | The operation is performed according to the IEC/IEEE Standard for Binary
6592 | Floating-Point Arithmetic.
6593 *----------------------------------------------------------------------------*/
6594
6595 float128 float128_sqrt(float128 a, float_status *status)
6596 {
6597     flag aSign;
6598     int32_t aExp, zExp;
6599     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6600     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6601
6602     aSig1 = extractFloat128Frac1( a );
6603     aSig0 = extractFloat128Frac0( a );
6604     aExp = extractFloat128Exp( a );
6605     aSign = extractFloat128Sign( a );
6606     if ( aExp == 0x7FFF ) {
6607         if (aSig0 | aSig1) {
6608             return propagateFloat128NaN(a, a, status);
6609         }
6610         if ( ! aSign ) return a;
6611         goto invalid;
6612     }
6613     if ( aSign ) {
6614         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6615  invalid:
6616         float_raise(float_flag_invalid, status);
6617         return float128_default_nan(status);
6618     }
6619     if ( aExp == 0 ) {
6620         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6621         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6622     }
6623     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6624     aSig0 |= LIT64( 0x0001000000000000 );
6625     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6626     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6627     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6628     doubleZSig0 = zSig0<<1;
6629     mul64To128( zSig0, zSig0, &term0, &term1 );
6630     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6631     while ( (int64_t) rem0 < 0 ) {
6632         --zSig0;
6633         doubleZSig0 -= 2;
6634         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6635     }
6636     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6637     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6638         if ( zSig1 == 0 ) zSig1 = 1;
6639         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6640         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6641         mul64To128( zSig1, zSig1, &term2, &term3 );
6642         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6643         while ( (int64_t) rem1 < 0 ) {
6644             --zSig1;
6645             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6646             term3 |= 1;
6647             term2 |= doubleZSig0;
6648             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6649         }
6650         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6651     }
6652     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6653     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6654
6655 }
6656
6657 /*----------------------------------------------------------------------------
6658 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6659 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6660 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6661 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6662 *----------------------------------------------------------------------------*/
6663
6664 int float128_eq(float128 a, float128 b, float_status *status)
6665 {
6666
6667     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6668               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6669          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6670               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6671        ) {
6672         float_raise(float_flag_invalid, status);
6673         return 0;
6674     }
6675     return
6676            ( a.low == b.low )
6677         && (    ( a.high == b.high )
6678              || (    ( a.low == 0 )
6679                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6680            );
6681
6682 }
6683
6684 /*----------------------------------------------------------------------------
6685 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6686 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6687 | exception is raised if either operand is a NaN.  The comparison is performed
6688 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6689 *----------------------------------------------------------------------------*/
6690
6691 int float128_le(float128 a, float128 b, float_status *status)
6692 {
6693     flag aSign, bSign;
6694
6695     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6696               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6697          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6698               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6699        ) {
6700         float_raise(float_flag_invalid, status);
6701         return 0;
6702     }
6703     aSign = extractFloat128Sign( a );
6704     bSign = extractFloat128Sign( b );
6705     if ( aSign != bSign ) {
6706         return
6707                aSign
6708             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6709                  == 0 );
6710     }
6711     return
6712           aSign ? le128( b.high, b.low, a.high, a.low )
6713         : le128( a.high, a.low, b.high, b.low );
6714
6715 }
6716
6717 /*----------------------------------------------------------------------------
6718 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6719 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6720 | raised if either operand is a NaN.  The comparison is performed according
6721 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6722 *----------------------------------------------------------------------------*/
6723
6724 int float128_lt(float128 a, float128 b, float_status *status)
6725 {
6726     flag aSign, bSign;
6727
6728     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6729               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6730          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6731               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6732        ) {
6733         float_raise(float_flag_invalid, status);
6734         return 0;
6735     }
6736     aSign = extractFloat128Sign( a );
6737     bSign = extractFloat128Sign( b );
6738     if ( aSign != bSign ) {
6739         return
6740                aSign
6741             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6742                  != 0 );
6743     }
6744     return
6745           aSign ? lt128( b.high, b.low, a.high, a.low )
6746         : lt128( a.high, a.low, b.high, b.low );
6747
6748 }
6749
6750 /*----------------------------------------------------------------------------
6751 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6752 | be compared, and 0 otherwise.  The invalid exception is raised if either
6753 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6754 | Standard for Binary Floating-Point Arithmetic.
6755 *----------------------------------------------------------------------------*/
6756
6757 int float128_unordered(float128 a, float128 b, float_status *status)
6758 {
6759     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6760               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6761          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6762               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6763        ) {
6764         float_raise(float_flag_invalid, status);
6765         return 1;
6766     }
6767     return 0;
6768 }
6769
6770 /*----------------------------------------------------------------------------
6771 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6772 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6773 | exception.  The comparison is performed according to the IEC/IEEE Standard
6774 | for Binary Floating-Point Arithmetic.
6775 *----------------------------------------------------------------------------*/
6776
6777 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6778 {
6779
6780     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6781               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6782          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6783               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6784        ) {
6785         if (float128_is_signaling_nan(a, status)
6786          || float128_is_signaling_nan(b, status)) {
6787             float_raise(float_flag_invalid, status);
6788         }
6789         return 0;
6790     }
6791     return
6792            ( a.low == b.low )
6793         && (    ( a.high == b.high )
6794              || (    ( a.low == 0 )
6795                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6796            );
6797
6798 }
6799
6800 /*----------------------------------------------------------------------------
6801 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6802 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6803 | cause an exception.  Otherwise, the comparison is performed according to the
6804 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6805 *----------------------------------------------------------------------------*/
6806
6807 int float128_le_quiet(float128 a, float128 b, float_status *status)
6808 {
6809     flag aSign, bSign;
6810
6811     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6812               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6813          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6814               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6815        ) {
6816         if (float128_is_signaling_nan(a, status)
6817          || float128_is_signaling_nan(b, status)) {
6818             float_raise(float_flag_invalid, status);
6819         }
6820         return 0;
6821     }
6822     aSign = extractFloat128Sign( a );
6823     bSign = extractFloat128Sign( b );
6824     if ( aSign != bSign ) {
6825         return
6826                aSign
6827             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6828                  == 0 );
6829     }
6830     return
6831           aSign ? le128( b.high, b.low, a.high, a.low )
6832         : le128( a.high, a.low, b.high, b.low );
6833
6834 }
6835
6836 /*----------------------------------------------------------------------------
6837 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6838 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6839 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6840 | Standard for Binary Floating-Point Arithmetic.
6841 *----------------------------------------------------------------------------*/
6842
6843 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6844 {
6845     flag aSign, bSign;
6846
6847     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6848               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6849          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6850               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6851        ) {
6852         if (float128_is_signaling_nan(a, status)
6853          || float128_is_signaling_nan(b, status)) {
6854             float_raise(float_flag_invalid, status);
6855         }
6856         return 0;
6857     }
6858     aSign = extractFloat128Sign( a );
6859     bSign = extractFloat128Sign( b );
6860     if ( aSign != bSign ) {
6861         return
6862                aSign
6863             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6864                  != 0 );
6865     }
6866     return
6867           aSign ? lt128( b.high, b.low, a.high, a.low )
6868         : lt128( a.high, a.low, b.high, b.low );
6869
6870 }
6871
6872 /*----------------------------------------------------------------------------
6873 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6874 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6875 | comparison is performed according to the IEC/IEEE Standard for Binary
6876 | Floating-Point Arithmetic.
6877 *----------------------------------------------------------------------------*/
6878
6879 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6880 {
6881     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6882               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6883          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6884               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6885        ) {
6886         if (float128_is_signaling_nan(a, status)
6887          || float128_is_signaling_nan(b, status)) {
6888             float_raise(float_flag_invalid, status);
6889         }
6890         return 1;
6891     }
6892     return 0;
6893 }
6894
6895 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6896                                             int is_quiet, float_status *status)
6897 {
6898     flag aSign, bSign;
6899
6900     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6901         float_raise(float_flag_invalid, status);
6902         return float_relation_unordered;
6903     }
6904     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6905           ( extractFloatx80Frac( a )<<1 ) ) ||
6906         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6907           ( extractFloatx80Frac( b )<<1 ) )) {
6908         if (!is_quiet ||
6909             floatx80_is_signaling_nan(a, status) ||
6910             floatx80_is_signaling_nan(b, status)) {
6911             float_raise(float_flag_invalid, status);
6912         }
6913         return float_relation_unordered;
6914     }
6915     aSign = extractFloatx80Sign( a );
6916     bSign = extractFloatx80Sign( b );
6917     if ( aSign != bSign ) {
6918
6919         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6920              ( ( a.low | b.low ) == 0 ) ) {
6921             /* zero case */
6922             return float_relation_equal;
6923         } else {
6924             return 1 - (2 * aSign);
6925         }
6926     } else {
6927         if (a.low == b.low && a.high == b.high) {
6928             return float_relation_equal;
6929         } else {
6930             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6931         }
6932     }
6933 }
6934
6935 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6936 {
6937     return floatx80_compare_internal(a, b, 0, status);
6938 }
6939
6940 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6941 {
6942     return floatx80_compare_internal(a, b, 1, status);
6943 }
6944
6945 static inline int float128_compare_internal(float128 a, float128 b,
6946                                             int is_quiet, float_status *status)
6947 {
6948     flag aSign, bSign;
6949
6950     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6951           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6952         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6953           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6954         if (!is_quiet ||
6955             float128_is_signaling_nan(a, status) ||
6956             float128_is_signaling_nan(b, status)) {
6957             float_raise(float_flag_invalid, status);
6958         }
6959         return float_relation_unordered;
6960     }
6961     aSign = extractFloat128Sign( a );
6962     bSign = extractFloat128Sign( b );
6963     if ( aSign != bSign ) {
6964         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6965             /* zero case */
6966             return float_relation_equal;
6967         } else {
6968             return 1 - (2 * aSign);
6969         }
6970     } else {
6971         if (a.low == b.low && a.high == b.high) {
6972             return float_relation_equal;
6973         } else {
6974             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6975         }
6976     }
6977 }
6978
6979 int float128_compare(float128 a, float128 b, float_status *status)
6980 {
6981     return float128_compare_internal(a, b, 0, status);
6982 }
6983
6984 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6985 {
6986     return float128_compare_internal(a, b, 1, status);
6987 }
6988
6989 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6990 {
6991     flag aSign;
6992     int32_t aExp;
6993     uint64_t aSig;
6994
6995     if (floatx80_invalid_encoding(a)) {
6996         float_raise(float_flag_invalid, status);
6997         return floatx80_default_nan(status);
6998     }
6999     aSig = extractFloatx80Frac( a );
7000     aExp = extractFloatx80Exp( a );
7001     aSign = extractFloatx80Sign( a );
7002
7003     if ( aExp == 0x7FFF ) {
7004         if ( aSig<<1 ) {
7005             return propagateFloatx80NaN(a, a, status);
7006         }
7007         return a;
7008     }
7009
7010     if (aExp == 0) {
7011         if (aSig == 0) {
7012             return a;
7013         }
7014         aExp++;
7015     }
7016
7017     if (n > 0x10000) {
7018         n = 0x10000;
7019     } else if (n < -0x10000) {
7020         n = -0x10000;
7021     }
7022
7023     aExp += n;
7024     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7025                                          aSign, aExp, aSig, 0, status);
7026 }
7027
7028 float128 float128_scalbn(float128 a, int n, float_status *status)
7029 {
7030     flag aSign;
7031     int32_t aExp;
7032     uint64_t aSig0, aSig1;
7033
7034     aSig1 = extractFloat128Frac1( a );
7035     aSig0 = extractFloat128Frac0( a );
7036     aExp = extractFloat128Exp( a );
7037     aSign = extractFloat128Sign( a );
7038     if ( aExp == 0x7FFF ) {
7039         if ( aSig0 | aSig1 ) {
7040             return propagateFloat128NaN(a, a, status);
7041         }
7042         return a;
7043     }
7044     if (aExp != 0) {
7045         aSig0 |= LIT64( 0x0001000000000000 );
7046     } else if (aSig0 == 0 && aSig1 == 0) {
7047         return a;
7048     } else {
7049         aExp++;
7050     }
7051
7052     if (n > 0x10000) {
7053         n = 0x10000;
7054     } else if (n < -0x10000) {
7055         n = -0x10000;
7056     }
7057
7058     aExp += n - 1;
7059     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7060                                          , status);
7061
7062 }