fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * The code in this source file is derived from release 2a of the SoftFloat
   5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
   6  * some later contributions) are provided under that license, as detailed below.
   7  * It has subsequently been modified by contributors to the QEMU Project,
   8  * so some portions are provided under:
   9  *  the SoftFloat-2a license
  10  *  the BSD license
  11  *  GPL-v2-or-later
  12  *
  13  * Any future contributions to this file after December 1st 2014 will be
  14  * taken to be licensed under the Softfloat-2a license unless specifically
  15  * indicated otherwise.
  16  */
  17
  18 /*
  19 ===============================================================================
  20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
  21 Arithmetic Package, Release 2a.
  22
  23 Written by John R. Hauser.  This work was made possible in part by the
  24 International Computer Science Institute, located at Suite 600, 1947 Center
  25 Street, Berkeley, California 94704.  Funding was partially provided by the
  26 National Science Foundation under grant MIP-9311980.  The original version
  27 of this code was written as part of a project to build a fixed-point vector
  28 processor in collaboration with the University of California at Berkeley,
  29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  31 arithmetic/SoftFloat.html'.
  32
  33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
  34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
  36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  38
  39 Derivative works are acceptable, even for commercial purposes, so long as
  40 (1) they include prominent notice that the work is derivative, and (2) they
  41 include prominent notice akin to these four paragraphs for those parts of
  42 this code that are retained.
  43
  44 ===============================================================================
  45 */
  46
  47 /* BSD licensing:
  48  * Copyright (c) 2006, Fabrice Bellard
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are met:
  53  *
  54  * 1. Redistributions of source code must retain the above copyright notice,
  55  * this list of conditions and the following disclaimer.
  56  *
  57  * 2. Redistributions in binary form must reproduce the above copyright notice,
  58  * this list of conditions and the following disclaimer in the documentation
  59  * and/or other materials provided with the distribution.
  60  *
  61  * 3. Neither the name of the copyright holder nor the names of its contributors
  62  * may be used to endorse or promote products derived from this software without
  63  * specific prior written permission.
  64  *
  65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  75  * THE POSSIBILITY OF SUCH DAMAGE.
  76  */
  77
  78 /* Portions of this work are licensed under the terms of the GNU GPL,
  79  * version 2 or later. See the COPYING file in the top-level directory.
  80  */
  81
  82 /* softfloat (and in particular the code in softfloat-specialize.h) is
  83  * target-dependent and needs the TARGET_* macros.
  84  */
  85 #include "qemu/osdep.h"
  86
  87 #include "fpu/softfloat.h"
  88
  89 /* We only need stdlib for abort() */
  90
  91 /*----------------------------------------------------------------------------
  92 | Primitive arithmetic functions, including multi-word arithmetic, and
  93 | division and square root approximations.  (Can be specialized to target if
  94 | desired.)
  95 *----------------------------------------------------------------------------*/
  96 #include "softfloat-macros.h"
  97
  98 /*----------------------------------------------------------------------------
  99 | Functions and definitions to determine:  (1) whether tininess for underflow
 100 | is detected before or after rounding by default, (2) what (if anything)
 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
 103 | are propagated from function inputs to output.  These details are target-
 104 | specific.
 105 *----------------------------------------------------------------------------*/
 106 #include "softfloat-specialize.h"
 107
 108 /*----------------------------------------------------------------------------
 109 | Returns the fraction bits of the half-precision floating-point value `a'.
 110 *----------------------------------------------------------------------------*/
 111
 112 static inline uint32_t extractFloat16Frac(float16 a)
 113 {
 114     return float16_val(a) & 0x3ff;
 115 }
 116
 117 /*----------------------------------------------------------------------------
 118 | Returns the exponent bits of the half-precision floating-point value `a'.
 119 *----------------------------------------------------------------------------*/
 120
 121 static inline int extractFloat16Exp(float16 a)
 122 {
 123     return (float16_val(a) >> 10) & 0x1f;
 124 }
 125
 126 /*----------------------------------------------------------------------------
 127 | Returns the sign bit of the single-precision floating-point value `a'.
 128 *----------------------------------------------------------------------------*/
 129
 130 static inline flag extractFloat16Sign(float16 a)
 131 {
 132     return float16_val(a)>>15;
 133 }
 134
 135 /*----------------------------------------------------------------------------
 136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 138 | input.  If `zSign' is 1, the input is negated before being converted to an
 139 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 140 | is simply rounded to an integer, with the inexact exception raised if the
 141 | input cannot be represented exactly as an integer.  However, if the fixed-
 142 | point input is too large, the invalid exception is raised and the largest
 143 | positive or negative integer is returned.
 144 *----------------------------------------------------------------------------*/
 145
 146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
 147 {
 148     int8_t roundingMode;
 149     flag roundNearestEven;
 150     int8_t roundIncrement, roundBits;
 151     int32_t z;
 152
 153     roundingMode = status->float_rounding_mode;
 154     roundNearestEven = ( roundingMode == float_round_nearest_even );
 155     switch (roundingMode) {
 156     case float_round_nearest_even:
 157     case float_round_ties_away:
 158         roundIncrement = 0x40;
 159         break;
 160     case float_round_to_zero:
 161         roundIncrement = 0;
 162         break;
 163     case float_round_up:
 164         roundIncrement = zSign ? 0 : 0x7f;
 165         break;
 166     case float_round_down:
 167         roundIncrement = zSign ? 0x7f : 0;
 168         break;
 169     default:
 170         abort();
 171     }
 172     roundBits = absZ & 0x7F;
 173     absZ = ( absZ + roundIncrement )>>7;
 174     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 175     z = absZ;
 176     if ( zSign ) z = - z;
 177     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 178         float_raise(float_flag_invalid, status);
 179         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 180     }
 181     if (roundBits) {
 182         status->float_exception_flags |= float_flag_inexact;
 183     }
 184     return z;
 185
 186 }
 187
 188 /*----------------------------------------------------------------------------
 189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 191 | and returns the properly rounded 64-bit integer corresponding to the input.
 192 | If `zSign' is 1, the input is negated before being converted to an integer.
 193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 194 | the inexact exception raised if the input cannot be represented exactly as
 195 | an integer.  However, if the fixed-point input is too large, the invalid
 196 | exception is raised and the largest positive or negative integer is
 197 | returned.
 198 *----------------------------------------------------------------------------*/
 199
 200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
 201                                float_status *status)
 202 {
 203     int8_t roundingMode;
 204     flag roundNearestEven, increment;
 205     int64_t z;
 206
 207     roundingMode = status->float_rounding_mode;
 208     roundNearestEven = ( roundingMode == float_round_nearest_even );
 209     switch (roundingMode) {
 210     case float_round_nearest_even:
 211     case float_round_ties_away:
 212         increment = ((int64_t) absZ1 < 0);
 213         break;
 214     case float_round_to_zero:
 215         increment = 0;
 216         break;
 217     case float_round_up:
 218         increment = !zSign && absZ1;
 219         break;
 220     case float_round_down:
 221         increment = zSign && absZ1;
 222         break;
 223     default:
 224         abort();
 225     }
 226     if ( increment ) {
 227         ++absZ0;
 228         if ( absZ0 == 0 ) goto overflow;
 229         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 230     }
 231     z = absZ0;
 232     if ( zSign ) z = - z;
 233     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 234  overflow:
 235         float_raise(float_flag_invalid, status);
 236         return
 237               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 238             : LIT64( 0x7FFFFFFFFFFFFFFF );
 239     }
 240     if (absZ1) {
 241         status->float_exception_flags |= float_flag_inexact;
 242     }
 243     return z;
 244
 245 }
 246
 247 /*----------------------------------------------------------------------------
 248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
 251 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 252 | with the inexact exception raised if the input cannot be represented exactly
 253 | as an integer.  However, if the fixed-point input is too large, the invalid
 254 | exception is raised and the largest unsigned integer is returned.
 255 *----------------------------------------------------------------------------*/
 256
 257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
 258                                 uint64_t absZ1, float_status *status)
 259 {
 260     int8_t roundingMode;
 261     flag roundNearestEven, increment;
 262
 263     roundingMode = status->float_rounding_mode;
 264     roundNearestEven = (roundingMode == float_round_nearest_even);
 265     switch (roundingMode) {
 266     case float_round_nearest_even:
 267     case float_round_ties_away:
 268         increment = ((int64_t)absZ1 < 0);
 269         break;
 270     case float_round_to_zero:
 271         increment = 0;
 272         break;
 273     case float_round_up:
 274         increment = !zSign && absZ1;
 275         break;
 276     case float_round_down:
 277         increment = zSign && absZ1;
 278         break;
 279     default:
 280         abort();
 281     }
 282     if (increment) {
 283         ++absZ0;
 284         if (absZ0 == 0) {
 285             float_raise(float_flag_invalid, status);
 286             return LIT64(0xFFFFFFFFFFFFFFFF);
 287         }
 288         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 289     }
 290
 291     if (zSign && absZ0) {
 292         float_raise(float_flag_invalid, status);
 293         return 0;
 294     }
 295
 296     if (absZ1) {
 297         status->float_exception_flags |= float_flag_inexact;
 298     }
 299     return absZ0;
 300 }
 301
 302 /*----------------------------------------------------------------------------
 303 | Returns the fraction bits of the single-precision floating-point value `a'.
 304 *----------------------------------------------------------------------------*/
 305
 306 static inline uint32_t extractFloat32Frac( float32 a )
 307 {
 308
 309     return float32_val(a) & 0x007FFFFF;
 310
 311 }
 312
 313 /*----------------------------------------------------------------------------
 314 | Returns the exponent bits of the single-precision floating-point value `a'.
 315 *----------------------------------------------------------------------------*/
 316
 317 static inline int extractFloat32Exp(float32 a)
 318 {
 319
 320     return ( float32_val(a)>>23 ) & 0xFF;
 321
 322 }
 323
 324 /*----------------------------------------------------------------------------
 325 | Returns the sign bit of the single-precision floating-point value `a'.
 326 *----------------------------------------------------------------------------*/
 327
 328 static inline flag extractFloat32Sign( float32 a )
 329 {
 330
 331     return float32_val(a)>>31;
 332
 333 }
 334
 335 /*----------------------------------------------------------------------------
 336 | If `a' is denormal and we are in flush-to-zero mode then set the
 337 | input-denormal exception and return zero. Otherwise just return the value.
 338 *----------------------------------------------------------------------------*/
 339 float32 float32_squash_input_denormal(float32 a, float_status *status)
 340 {
 341     if (status->flush_inputs_to_zero) {
 342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 343             float_raise(float_flag_input_denormal, status);
 344             return make_float32(float32_val(a) & 0x80000000);
 345         }
 346     }
 347     return a;
 348 }
 349
 350 /*----------------------------------------------------------------------------
 351 | Normalizes the subnormal single-precision floating-point value represented
 352 | by the denormalized significand `aSig'.  The normalized exponent and
 353 | significand are stored at the locations pointed to by `zExpPtr' and
 354 | `zSigPtr', respectively.
 355 *----------------------------------------------------------------------------*/
 356
 357 static void
 358  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
 359 {
 360     int8_t shiftCount;
 361
 362     shiftCount = countLeadingZeros32( aSig ) - 8;
 363     *zSigPtr = aSig<<shiftCount;
 364     *zExpPtr = 1 - shiftCount;
 365
 366 }
 367
 368 /*----------------------------------------------------------------------------
 369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 370 | single-precision floating-point value, returning the result.  After being
 371 | shifted into the proper positions, the three fields are simply added
 372 | together to form the result.  This means that any integer portion of `zSig'
 373 | will be added into the exponent.  Since a properly normalized significand
 374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 375 | than the desired result exponent whenever `zSig' is a complete, normalized
 376 | significand.
 377 *----------------------------------------------------------------------------*/
 378
 379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
 380 {
 381
 382     return make_float32(
 383           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 384
 385 }
 386
 387 /*----------------------------------------------------------------------------
 388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 389 | and significand `zSig', and returns the proper single-precision floating-
 390 | point value corresponding to the abstract input.  Ordinarily, the abstract
 391 | value is simply rounded and packed into the single-precision format, with
 392 | the inexact exception raised if the abstract input cannot be represented
 393 | exactly.  However, if the abstract value is too large, the overflow and
 394 | inexact exceptions are raised and an infinity or maximal finite value is
 395 | returned.  If the abstract value is too small, the input value is rounded to
 396 | a subnormal number, and the underflow and inexact exceptions are raised if
 397 | the abstract input cannot be represented exactly as a subnormal single-
 398 | precision floating-point number.
 399 |     The input significand `zSig' has its binary point between bits 30
 400 | and 29, which is 7 bits to the left of the usual location.  This shifted
 401 | significand must be normalized or smaller.  If `zSig' is not normalized,
 402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 403 | and it must not require rounding.  In the usual case that `zSig' is
 404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 406 | Binary Floating-Point Arithmetic.
 407 *----------------------------------------------------------------------------*/
 408
 409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 410                                    float_status *status)
 411 {
 412     int8_t roundingMode;
 413     flag roundNearestEven;
 414     int8_t roundIncrement, roundBits;
 415     flag isTiny;
 416
 417     roundingMode = status->float_rounding_mode;
 418     roundNearestEven = ( roundingMode == float_round_nearest_even );
 419     switch (roundingMode) {
 420     case float_round_nearest_even:
 421     case float_round_ties_away:
 422         roundIncrement = 0x40;
 423         break;
 424     case float_round_to_zero:
 425         roundIncrement = 0;
 426         break;
 427     case float_round_up:
 428         roundIncrement = zSign ? 0 : 0x7f;
 429         break;
 430     case float_round_down:
 431         roundIncrement = zSign ? 0x7f : 0;
 432         break;
 433     default:
 434         abort();
 435         break;
 436     }
 437     roundBits = zSig & 0x7F;
 438     if ( 0xFD <= (uint16_t) zExp ) {
 439         if (    ( 0xFD < zExp )
 440              || (    ( zExp == 0xFD )
 441                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 442            ) {
 443             float_raise(float_flag_overflow | float_flag_inexact, status);
 444             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 445         }
 446         if ( zExp < 0 ) {
 447             if (status->flush_to_zero) {
 448                 float_raise(float_flag_output_denormal, status);
 449                 return packFloat32(zSign, 0, 0);
 450             }
 451             isTiny =
 452                 (status->float_detect_tininess
 453                  == float_tininess_before_rounding)
 454                 || ( zExp < -1 )
 455                 || ( zSig + roundIncrement < 0x80000000 );
 456             shift32RightJamming( zSig, - zExp, &zSig );
 457             zExp = 0;
 458             roundBits = zSig & 0x7F;
 459             if (isTiny && roundBits) {
 460                 float_raise(float_flag_underflow, status);
 461             }
 462         }
 463     }
 464     if (roundBits) {
 465         status->float_exception_flags |= float_flag_inexact;
 466     }
 467     zSig = ( zSig + roundIncrement )>>7;
 468     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 469     if ( zSig == 0 ) zExp = 0;
 470     return packFloat32( zSign, zExp, zSig );
 471
 472 }
 473
 474 /*----------------------------------------------------------------------------
 475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 476 | and significand `zSig', and returns the proper single-precision floating-
 477 | point value corresponding to the abstract input.  This routine is just like
 478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 480 | floating-point exponent.
 481 *----------------------------------------------------------------------------*/
 482
 483 static float32
 484  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
 485                               float_status *status)
 486 {
 487     int8_t shiftCount;
 488
 489     shiftCount = countLeadingZeros32( zSig ) - 1;
 490     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
 491                                status);
 492
 493 }
 494
 495 /*----------------------------------------------------------------------------
 496 | Returns the fraction bits of the double-precision floating-point value `a'.
 497 *----------------------------------------------------------------------------*/
 498
 499 static inline uint64_t extractFloat64Frac( float64 a )
 500 {
 501
 502     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 503
 504 }
 505
 506 /*----------------------------------------------------------------------------
 507 | Returns the exponent bits of the double-precision floating-point value `a'.
 508 *----------------------------------------------------------------------------*/
 509
 510 static inline int extractFloat64Exp(float64 a)
 511 {
 512
 513     return ( float64_val(a)>>52 ) & 0x7FF;
 514
 515 }
 516
 517 /*----------------------------------------------------------------------------
 518 | Returns the sign bit of the double-precision floating-point value `a'.
 519 *----------------------------------------------------------------------------*/
 520
 521 static inline flag extractFloat64Sign( float64 a )
 522 {
 523
 524     return float64_val(a)>>63;
 525
 526 }
 527
 528 /*----------------------------------------------------------------------------
 529 | If `a' is denormal and we are in flush-to-zero mode then set the
 530 | input-denormal exception and return zero. Otherwise just return the value.
 531 *----------------------------------------------------------------------------*/
 532 float64 float64_squash_input_denormal(float64 a, float_status *status)
 533 {
 534     if (status->flush_inputs_to_zero) {
 535         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 536             float_raise(float_flag_input_denormal, status);
 537             return make_float64(float64_val(a) & (1ULL << 63));
 538         }
 539     }
 540     return a;
 541 }
 542
 543 /*----------------------------------------------------------------------------
 544 | Normalizes the subnormal double-precision floating-point value represented
 545 | by the denormalized significand `aSig'.  The normalized exponent and
 546 | significand are stored at the locations pointed to by `zExpPtr' and
 547 | `zSigPtr', respectively.
 548 *----------------------------------------------------------------------------*/
 549
 550 static void
 551  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
 552 {
 553     int8_t shiftCount;
 554
 555     shiftCount = countLeadingZeros64( aSig ) - 11;
 556     *zSigPtr = aSig<<shiftCount;
 557     *zExpPtr = 1 - shiftCount;
 558
 559 }
 560
 561 /*----------------------------------------------------------------------------
 562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 563 | double-precision floating-point value, returning the result.  After being
 564 | shifted into the proper positions, the three fields are simply added
 565 | together to form the result.  This means that any integer portion of `zSig'
 566 | will be added into the exponent.  Since a properly normalized significand
 567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 568 | than the desired result exponent whenever `zSig' is a complete, normalized
 569 | significand.
 570 *----------------------------------------------------------------------------*/
 571
 572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
 573 {
 574
 575     return make_float64(
 576         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 577
 578 }
 579
 580 /*----------------------------------------------------------------------------
 581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 582 | and significand `zSig', and returns the proper double-precision floating-
 583 | point value corresponding to the abstract input.  Ordinarily, the abstract
 584 | value is simply rounded and packed into the double-precision format, with
 585 | the inexact exception raised if the abstract input cannot be represented
 586 | exactly.  However, if the abstract value is too large, the overflow and
 587 | inexact exceptions are raised and an infinity or maximal finite value is
 588 | returned.  If the abstract value is too small, the input value is rounded to
 589 | a subnormal number, and the underflow and inexact exceptions are raised if
 590 | the abstract input cannot be represented exactly as a subnormal double-
 591 | precision floating-point number.
 592 |     The input significand `zSig' has its binary point between bits 62
 593 | and 61, which is 10 bits to the left of the usual location.  This shifted
 594 | significand must be normalized or smaller.  If `zSig' is not normalized,
 595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 596 | and it must not require rounding.  In the usual case that `zSig' is
 597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 599 | Binary Floating-Point Arithmetic.
 600 *----------------------------------------------------------------------------*/
 601
 602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 603                                    float_status *status)
 604 {
 605     int8_t roundingMode;
 606     flag roundNearestEven;
 607     int roundIncrement, roundBits;
 608     flag isTiny;
 609
 610     roundingMode = status->float_rounding_mode;
 611     roundNearestEven = ( roundingMode == float_round_nearest_even );
 612     switch (roundingMode) {
 613     case float_round_nearest_even:
 614     case float_round_ties_away:
 615         roundIncrement = 0x200;
 616         break;
 617     case float_round_to_zero:
 618         roundIncrement = 0;
 619         break;
 620     case float_round_up:
 621         roundIncrement = zSign ? 0 : 0x3ff;
 622         break;
 623     case float_round_down:
 624         roundIncrement = zSign ? 0x3ff : 0;
 625         break;
 626     case float_round_to_odd:
 627         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
 628         break;
 629     default:
 630         abort();
 631     }
 632     roundBits = zSig & 0x3FF;
 633     if ( 0x7FD <= (uint16_t) zExp ) {
 634         if (    ( 0x7FD < zExp )
 635              || (    ( zExp == 0x7FD )
 636                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 637            ) {
 638             bool overflow_to_inf = roundingMode != float_round_to_odd &&
 639                                    roundIncrement != 0;
 640             float_raise(float_flag_overflow | float_flag_inexact, status);
 641             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
 642         }
 643         if ( zExp < 0 ) {
 644             if (status->flush_to_zero) {
 645                 float_raise(float_flag_output_denormal, status);
 646                 return packFloat64(zSign, 0, 0);
 647             }
 648             isTiny =
 649                    (status->float_detect_tininess
 650                     == float_tininess_before_rounding)
 651                 || ( zExp < -1 )
 652                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 653             shift64RightJamming( zSig, - zExp, &zSig );
 654             zExp = 0;
 655             roundBits = zSig & 0x3FF;
 656             if (isTiny && roundBits) {
 657                 float_raise(float_flag_underflow, status);
 658             }
 659             if (roundingMode == float_round_to_odd) {
 660                 /*
 661                  * For round-to-odd case, the roundIncrement depends on
 662                  * zSig which just changed.
 663                  */
 664                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
 665             }
 666         }
 667     }
 668     if (roundBits) {
 669         status->float_exception_flags |= float_flag_inexact;
 670     }
 671     zSig = ( zSig + roundIncrement )>>10;
 672     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 673     if ( zSig == 0 ) zExp = 0;
 674     return packFloat64( zSign, zExp, zSig );
 675
 676 }
 677
 678 /*----------------------------------------------------------------------------
 679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 680 | and significand `zSig', and returns the proper double-precision floating-
 681 | point value corresponding to the abstract input.  This routine is just like
 682 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 683 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 684 | floating-point exponent.
 685 *----------------------------------------------------------------------------*/
 686
 687 static float64
 688  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
 689                               float_status *status)
 690 {
 691     int8_t shiftCount;
 692
 693     shiftCount = countLeadingZeros64( zSig ) - 1;
 694     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
 695                                status);
 696
 697 }
 698
 699 /*----------------------------------------------------------------------------
 700 | Returns the fraction bits of the extended double-precision floating-point
 701 | value `a'.
 702 *----------------------------------------------------------------------------*/
 703
 704 static inline uint64_t extractFloatx80Frac( floatx80 a )
 705 {
 706
 707     return a.low;
 708
 709 }
 710
 711 /*----------------------------------------------------------------------------
 712 | Returns the exponent bits of the extended double-precision floating-point
 713 | value `a'.
 714 *----------------------------------------------------------------------------*/
 715
 716 static inline int32_t extractFloatx80Exp( floatx80 a )
 717 {
 718
 719     return a.high & 0x7FFF;
 720
 721 }
 722
 723 /*----------------------------------------------------------------------------
 724 | Returns the sign bit of the extended double-precision floating-point value
 725 | `a'.
 726 *----------------------------------------------------------------------------*/
 727
 728 static inline flag extractFloatx80Sign( floatx80 a )
 729 {
 730
 731     return a.high>>15;
 732
 733 }
 734
 735 /*----------------------------------------------------------------------------
 736 | Normalizes the subnormal extended double-precision floating-point value
 737 | represented by the denormalized significand `aSig'.  The normalized exponent
 738 | and significand are stored at the locations pointed to by `zExpPtr' and
 739 | `zSigPtr', respectively.
 740 *----------------------------------------------------------------------------*/
 741
 742 static void
 743  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
 744 {
 745     int8_t shiftCount;
 746
 747     shiftCount = countLeadingZeros64( aSig );
 748     *zSigPtr = aSig<<shiftCount;
 749     *zExpPtr = 1 - shiftCount;
 750
 751 }
 752
 753 /*----------------------------------------------------------------------------
 754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 755 | extended double-precision floating-point value, returning the result.
 756 *----------------------------------------------------------------------------*/
 757
 758 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
 759 {
 760     floatx80 z;
 761
 762     z.low = zSig;
 763     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 764     return z;
 765
 766 }
 767
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0' and `zSig1',
| and returns the proper extended double-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| rounded and packed into the extended double-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal extended
| double-precision floating-point number.
|     If `roundingPrecision' is 32 or 64, the result is rounded to the same
| number of bits as single or double precision, respectively.  Otherwise, the
| result is rounded to the full precision of the extended double-precision
| format.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  The
| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|     The rounding mode is read from `status->float_rounding_mode'.  The modes
| handled here are nearest-even, ties-away, to-zero, up and down; any other
| mode reaches abort().  Exception flags are accumulated in `status'.
*----------------------------------------------------------------------------*/

static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                                     int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                                     float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced precision: `roundMask' covers the bits of zSig0 that will be
     * discarded (low 11 bits for 64, low 40 bits for 32) and
     * `roundIncrement' is half of the lowest kept bit.
     */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        /* Any other precision value is treated like 80. */
        goto precision80;
    }
    /* Collapse the low word into a sticky bit in bit 0 of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    /* Directed modes either truncate (increment 0) or round away from zero
     * whenever any discarded bit is set (increment == roundMask).
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned compare is true when zExp <= 0 or zExp >= 0x7FFE, i.e.
     * whenever overflow or a subnormal result is possible.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Overflow: exponent too large, or rounding carries out of bit 63
         * at the largest finite exponent.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny unless tininess is detected after rounding, zExp is
             * exactly 0, and adding the increment carries out of bit 63.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding reached bit 63: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* From here `roundIncrement' is the value of the lowest kept
             * bit.  On an exact tie in nearest-even mode that bit is added
             * to the mask so that it is cleared as well.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* A wrap-around means the significand carried out of bit 63. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even handling as in the subnormal branch above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the result significand.
     * For the nearest modes, increment when its top bit is set (at least
     * half); for directed modes, increment when rounding away from zero
     * and any bit of zSig1 is set.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With roundMask 0 the maximal finite significand below is all
             * ones.  When the reduced-precision path jumps to `overflow',
             * roundMask still holds its discarded-bit mask, so those low
             * bits come out as zero instead.
             */
            roundMask = 0;
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* Modes that round toward zero for this sign yield the largest
             * finite magnitude; the others yield infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* NOTE(review): unlike the reduced-precision path above, this
             * branch does not consult status->flush_to_zero - confirm that
             * this is intended.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* The shift changed zSig1, so the rounding decision is redone. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie in nearest-even mode: clear bit 0. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding reached bit 63: the exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of bit 63: renormalize and bump the exponent. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear bit 0. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        /* A zero significand is packed with a zero exponent field. */
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
 978
 979 /*----------------------------------------------------------------------------
 980 | Takes an abstract floating-point value having sign `zSign', exponent
 981 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 982 | and returns the proper extended double-precision floating-point value
 983 | corresponding to the abstract input.  This routine is just like
 984 | `roundAndPackFloatx80' except that the input significand does not have to be
 985 | normalized.
 986 *----------------------------------------------------------------------------*/
 987
 988 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
 989                                               flag zSign, int32_t zExp,
 990                                               uint64_t zSig0, uint64_t zSig1,
 991                                               float_status *status)
 992 {
 993     int8_t shiftCount;
 994
 995     if ( zSig0 == 0 ) {
 996         zSig0 = zSig1;
 997         zSig1 = 0;
 998         zExp -= 64;
 999     }
1000     shiftCount = countLeadingZeros64( zSig0 );
1001     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1002     zExp -= shiftCount;
1003     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1004                                 zSig0, zSig1, status);
1005
1006 }
1007
1008 /*----------------------------------------------------------------------------
1009 | Returns the least-significant 64 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012
1013 static inline uint64_t extractFloat128Frac1( float128 a )
1014 {
1015
1016     return a.low;
1017
1018 }
1019
1020 /*----------------------------------------------------------------------------
1021 | Returns the most-significant 48 fraction bits of the quadruple-precision
1022 | floating-point value `a'.
1023 *----------------------------------------------------------------------------*/
1024
1025 static inline uint64_t extractFloat128Frac0( float128 a )
1026 {
1027
1028     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1029
1030 }
1031
1032 /*----------------------------------------------------------------------------
1033 | Returns the exponent bits of the quadruple-precision floating-point value
1034 | `a'.
1035 *----------------------------------------------------------------------------*/
1036
1037 static inline int32_t extractFloat128Exp( float128 a )
1038 {
1039
1040     return ( a.high>>48 ) & 0x7FFF;
1041
1042 }
1043
1044 /*----------------------------------------------------------------------------
1045 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1046 *----------------------------------------------------------------------------*/
1047
1048 static inline flag extractFloat128Sign( float128 a )
1049 {
1050
1051     return a.high>>63;
1052
1053 }
1054
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision significand formed by the
| concatenation of `aSig0' and `aSig1'.  The significand is shifted so that
| its leading 1 bit lands in bit 48 of the upper word.  The upper 49 bits of
| the result are written to `*zSig0Ptr', the lower 64 bits to `*zSig1Ptr',
| and the matching exponent to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t normShift;

    /* Upper word occupied: a plain 128-bit left shift is enough. */
    if (aSig0 != 0) {
        normShift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, normShift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - normShift;
        return;
    }

    /* Upper word empty: every significant bit comes from aSig1. */
    normShift = countLeadingZeros64(aSig1) - 15;
    if (normShift >= 0) {
        *zSig0Ptr = aSig1 << normShift;
        *zSig1Ptr = 0;
    } else {
        /* The leading 1 sits above bit 48, so aSig1 straddles both words. */
        *zSig0Ptr = aSig1 >> (-normShift);
        *zSig1Ptr = aSig1 << (normShift & 63);
    }
    *zExpPtr = -normShift - 63;
}
1095
1096 /*----------------------------------------------------------------------------
1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099 | floating-point value, returning the result.  After being shifted into the
1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101 | added together to form the most significant 32 bits of the result.  This
1102 | means that any integer portion of `zSig0' will be added into the exponent.
1103 | Since a properly normalized significand will have an integer portion equal
1104 | to 1, the `zExp' input should be 1 less than the desired result exponent
1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1106 | significand.
1107 *----------------------------------------------------------------------------*/
1108
1109 static inline float128
1110  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1111 {
1112     float128 z;
1113
1114     z.low = zSig1;
1115     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1116     return z;
1117
1118 }
1119
/*----------------------------------------------------------------------------
| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
| and extended significand formed by the concatenation of `zSig0', `zSig1',
| and `zSig2', and returns the proper quadruple-precision floating-point value
| corresponding to the abstract input.  Ordinarily, the abstract value is
| simply rounded and packed into the quadruple-precision format, with the
| inexact exception raised if the abstract input cannot be represented
| exactly.  However, if the abstract value is too large, the overflow and
| inexact exceptions are raised and an infinity or maximal finite value is
| returned.  If the abstract value is too small, the input value is rounded to
| a subnormal number, and the underflow and inexact exceptions are raised if
| the abstract input cannot be represented exactly as a subnormal quadruple-
| precision floating-point number.
|     The input significand must be normalized or smaller.  If the input
| significand is not normalized, `zExp' must be 0; in that case, the result
| returned is a subnormal number, and it must not require rounding.  In the
| usual case that the input significand is normalized, `zExp' must be 1 less
| than the ``true'' floating-point exponent.  The handling of underflow and
| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|     The rounding mode is read from `status->float_rounding_mode'.  The modes
| handled here are nearest-even, ties-away, to-zero, up, down and to-odd; any
| other mode reaches abort().  Exception flags are accumulated in `status'.
*----------------------------------------------------------------------------*/

static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
                                     uint64_t zSig0, uint64_t zSig1,
                                     uint64_t zSig2, float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the result significand.  For the nearest
     * modes, increment when its top bit is set (at least half); for up and
     * down, increment when rounding away from zero and zSig2 is nonzero;
     * for to-odd, increment when bit 0 of zSig1 is clear and zSig2 is
     * nonzero.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    case float_round_to_odd:
        increment = !(zSig1 & 0x1) && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned compare is true when zExp < 0 or zExp >= 0x7FFD, i.e.
     * whenever overflow or a subnormal result is possible.
     */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Overflow: exponent too large, or an all-ones significand at the
         * largest finite exponent that is about to be incremented.
         */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /* To-zero, to-odd, and the directed mode pointing toward zero
             * for this sign yield the largest finite magnitude; the other
             * modes yield infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
                 || (roundingMode == float_round_to_odd)
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny unless tininess is detected after rounding, zExp is
             * exactly -1, and the pending increment is applied to an
             * all-ones significand.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if (isTiny && zSig2) {
                float_raise(float_flag_underflow, status);
            }
            /* The shift changed zSig1/zSig2, so the rounding decision is
             * redone.
             */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            case float_round_to_odd:
                increment = !(zSig1 & 0x1) && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    if (zSig2) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* Exact tie in nearest-even mode (zSig2 doubles to zero): clear
         * bit 0 of the result.
         */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A zero significand is packed with a zero exponent field. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
1255
1256 /*----------------------------------------------------------------------------
1257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1259 | returns the proper quadruple-precision floating-point value corresponding
1260 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1261 | except that the input significand has fewer bits and does not have to be
1262 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1263 | point exponent.
1264 *----------------------------------------------------------------------------*/
1265
1266 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1267                                               uint64_t zSig0, uint64_t zSig1,
1268                                               float_status *status)
1269 {
1270     int8_t shiftCount;
1271     uint64_t zSig2;
1272
1273     if ( zSig0 == 0 ) {
1274         zSig0 = zSig1;
1275         zSig1 = 0;
1276         zExp -= 64;
1277     }
1278     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1279     if ( 0 <= shiftCount ) {
1280         zSig2 = 0;
1281         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1282     }
1283     else {
1284         shift128ExtraRightJamming(
1285             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1286     }
1287     zExp -= shiftCount;
1288     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1289
1290 }
1291
1292 /*----------------------------------------------------------------------------
1293 | Returns the result of converting the 32-bit two's complement integer `a'
1294 | to the single-precision floating-point format.  The conversion is performed
1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 *----------------------------------------------------------------------------*/
1297
1298 float32 int32_to_float32(int32_t a, float_status *status)
1299 {
1300     flag zSign;
1301
1302     if ( a == 0 ) return float32_zero;
1303     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1304     zSign = ( a < 0 );
1305     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1306 }
1307
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the 32-bit two's complement integer `a'
1310 | to the double-precision floating-point format.  The conversion is performed
1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312 *----------------------------------------------------------------------------*/
1313
1314 float64 int32_to_float64(int32_t a, float_status *status)
1315 {
1316     flag zSign;
1317     uint32_t absA;
1318     int8_t shiftCount;
1319     uint64_t zSig;
1320
1321     if ( a == 0 ) return float64_zero;
1322     zSign = ( a < 0 );
1323     absA = zSign ? - a : a;
1324     shiftCount = countLeadingZeros32( absA ) + 21;
1325     zSig = absA;
1326     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1327
1328 }
1329
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the 32-bit two's complement integer `a'
1332 | to the extended double-precision floating-point format.  The conversion
1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic.
1335 *----------------------------------------------------------------------------*/
1336
1337 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1338 {
1339     flag zSign;
1340     uint32_t absA;
1341     int8_t shiftCount;
1342     uint64_t zSig;
1343
1344     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1345     zSign = ( a < 0 );
1346     absA = zSign ? - a : a;
1347     shiftCount = countLeadingZeros32( absA ) + 32;
1348     zSig = absA;
1349     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1350
1351 }
1352
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of converting the 32-bit two's complement integer `a' to
1355 | the quadruple-precision floating-point format.  The conversion is performed
1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357 *----------------------------------------------------------------------------*/
1358
1359 float128 int32_to_float128(int32_t a, float_status *status)
1360 {
1361     flag zSign;
1362     uint32_t absA;
1363     int8_t shiftCount;
1364     uint64_t zSig0;
1365
1366     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1367     zSign = ( a < 0 );
1368     absA = zSign ? - a : a;
1369     shiftCount = countLeadingZeros32( absA ) + 17;
1370     zSig0 = absA;
1371     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1372
1373 }
1374
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the 64-bit two's complement integer `a'
1377 | to the single-precision floating-point format.  The conversion is performed
1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379 *----------------------------------------------------------------------------*/
1380
1381 float32 int64_to_float32(int64_t a, float_status *status)
1382 {
1383     flag zSign;
1384     uint64_t absA;
1385     int8_t shiftCount;
1386
1387     if ( a == 0 ) return float32_zero;
1388     zSign = ( a < 0 );
1389     absA = zSign ? - a : a;
1390     shiftCount = countLeadingZeros64( absA ) - 40;
1391     if ( 0 <= shiftCount ) {
1392         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1393     }
1394     else {
1395         shiftCount += 7;
1396         if ( shiftCount < 0 ) {
1397             shift64RightJamming( absA, - shiftCount, &absA );
1398         }
1399         else {
1400             absA <<= shiftCount;
1401         }
1402         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1403     }
1404
1405 }
1406
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the double-precision floating-point format.  The conversion is performed
1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412
1413 float64 int64_to_float64(int64_t a, float_status *status)
1414 {
1415     flag zSign;
1416
1417     if ( a == 0 ) return float64_zero;
1418     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1419         return packFloat64( 1, 0x43E, 0 );
1420     }
1421     zSign = ( a < 0 );
1422     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1423 }
1424
1425 /*----------------------------------------------------------------------------
1426 | Returns the result of converting the 64-bit two's complement integer `a'
1427 | to the extended double-precision floating-point format.  The conversion
1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1429 | Arithmetic.
1430 *----------------------------------------------------------------------------*/
1431
1432 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1433 {
1434     flag zSign;
1435     uint64_t absA;
1436     int8_t shiftCount;
1437
1438     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1439     zSign = ( a < 0 );
1440     absA = zSign ? - a : a;
1441     shiftCount = countLeadingZeros64( absA );
1442     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1443
1444 }
1445
1446 /*----------------------------------------------------------------------------
1447 | Returns the result of converting the 64-bit two's complement integer `a' to
1448 | the quadruple-precision floating-point format.  The conversion is performed
1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450 *----------------------------------------------------------------------------*/
1451
1452 float128 int64_to_float128(int64_t a, float_status *status)
1453 {
1454     flag zSign;
1455     uint64_t absA;
1456     int8_t shiftCount;
1457     int32_t zExp;
1458     uint64_t zSig0, zSig1;
1459
1460     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1461     zSign = ( a < 0 );
1462     absA = zSign ? - a : a;
1463     shiftCount = countLeadingZeros64( absA ) + 49;
1464     zExp = 0x406E - shiftCount;
1465     if ( 64 <= shiftCount ) {
1466         zSig1 = 0;
1467         zSig0 = absA;
1468         shiftCount -= 64;
1469     }
1470     else {
1471         zSig1 = absA;
1472         zSig0 = 0;
1473     }
1474     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1475     return packFloat128( zSign, zExp, zSig0, zSig1 );
1476
1477 }
1478
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the 64-bit unsigned integer `a'
1481 | to the single-precision floating-point format.  The conversion is performed
1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1484
1485 float32 uint64_to_float32(uint64_t a, float_status *status)
1486 {
1487     int shiftcount;
1488
1489     if (a == 0) {
1490         return float32_zero;
1491     }
1492
1493     /* Determine (left) shift needed to put first set bit into bit posn 23
1494      * (since packFloat32() expects the binary point between bits 23 and 22);
1495      * this is the fast case for smallish numbers.
1496      */
1497     shiftcount = countLeadingZeros64(a) - 40;
1498     if (shiftcount >= 0) {
1499         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1500     }
1501     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502      * expects the binary point between bits 30 and 29, hence the + 7.
1503      */
1504     shiftcount += 7;
1505     if (shiftcount < 0) {
1506         shift64RightJamming(a, -shiftcount, &a);
1507     } else {
1508         a <<= shiftcount;
1509     }
1510
1511     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1512 }
1513
1514 /*----------------------------------------------------------------------------
1515 | Returns the result of converting the 64-bit unsigned integer `a'
1516 | to the double-precision floating-point format.  The conversion is performed
1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 *----------------------------------------------------------------------------*/
1519
1520 float64 uint64_to_float64(uint64_t a, float_status *status)
1521 {
1522     int exp = 0x43C;
1523     int shiftcount;
1524
1525     if (a == 0) {
1526         return float64_zero;
1527     }
1528
1529     shiftcount = countLeadingZeros64(a) - 1;
1530     if (shiftcount < 0) {
1531         shift64RightJamming(a, -shiftcount, &a);
1532     } else {
1533         a <<= shiftcount;
1534     }
1535     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1536 }
1537
1538 /*----------------------------------------------------------------------------
1539 | Returns the result of converting the 64-bit unsigned integer `a'
1540 | to the quadruple-precision floating-point format.  The conversion is performed
1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542 *----------------------------------------------------------------------------*/
1543
1544 float128 uint64_to_float128(uint64_t a, float_status *status)
1545 {
1546     if (a == 0) {
1547         return float128_zero;
1548     }
1549     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1550 }
1551
1552 /*----------------------------------------------------------------------------
1553 | Returns the result of converting the single-precision floating-point value
1554 | `a' to the 32-bit two's complement integer format.  The conversion is
1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1556 | Arithmetic---which means in particular that the conversion is rounded
1557 | according to the current rounding mode.  If `a' is a NaN, the largest
1558 | positive integer is returned.  Otherwise, if the conversion overflows, the
1559 | largest integer with the same sign as `a' is returned.
1560 *----------------------------------------------------------------------------*/
1561
1562 int32_t float32_to_int32(float32 a, float_status *status)
1563 {
1564     flag aSign;
1565     int aExp;
1566     int shiftCount;
1567     uint32_t aSig;
1568     uint64_t aSig64;
1569
1570     a = float32_squash_input_denormal(a, status);
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1575     if ( aExp ) aSig |= 0x00800000;
1576     shiftCount = 0xAF - aExp;
1577     aSig64 = aSig;
1578     aSig64 <<= 32;
1579     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1580     return roundAndPackInt32(aSign, aSig64, status);
1581
1582 }
1583
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of converting the single-precision floating-point value
1586 | `a' to the 32-bit two's complement integer format.  The conversion is
1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1588 | Arithmetic, except that the conversion is always rounded toward zero.
1589 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1590 | the conversion overflows, the largest integer with the same sign as `a' is
1591 | returned.
1592 *----------------------------------------------------------------------------*/
1593
1594 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1595 {
1596     flag aSign;
1597     int aExp;
1598     int shiftCount;
1599     uint32_t aSig;
1600     int32_t z;
1601     a = float32_squash_input_denormal(a, status);
1602
1603     aSig = extractFloat32Frac( a );
1604     aExp = extractFloat32Exp( a );
1605     aSign = extractFloat32Sign( a );
1606     shiftCount = aExp - 0x9E;
1607     if ( 0 <= shiftCount ) {
1608         if ( float32_val(a) != 0xCF000000 ) {
1609             float_raise(float_flag_invalid, status);
1610             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1611         }
1612         return (int32_t) 0x80000000;
1613     }
1614     else if ( aExp <= 0x7E ) {
1615         if (aExp | aSig) {
1616             status->float_exception_flags |= float_flag_inexact;
1617         }
1618         return 0;
1619     }
1620     aSig = ( aSig | 0x00800000 )<<8;
1621     z = aSig>>( - shiftCount );
1622     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1623         status->float_exception_flags |= float_flag_inexact;
1624     }
1625     if ( aSign ) z = - z;
1626     return z;
1627
1628 }
1629
1630 /*----------------------------------------------------------------------------
1631 | Returns the result of converting the single-precision floating-point value
1632 | `a' to the 16-bit two's complement integer format.  The conversion is
1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1634 | Arithmetic, except that the conversion is always rounded toward zero.
1635 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1636 | the conversion overflows, the largest integer with the same sign as `a' is
1637 | returned.
1638 *----------------------------------------------------------------------------*/
1639
1640 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1641 {
1642     flag aSign;
1643     int aExp;
1644     int shiftCount;
1645     uint32_t aSig;
1646     int32_t z;
1647
1648     aSig = extractFloat32Frac( a );
1649     aExp = extractFloat32Exp( a );
1650     aSign = extractFloat32Sign( a );
1651     shiftCount = aExp - 0x8E;
1652     if ( 0 <= shiftCount ) {
1653         if ( float32_val(a) != 0xC7000000 ) {
1654             float_raise(float_flag_invalid, status);
1655             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656                 return 0x7FFF;
1657             }
1658         }
1659         return (int32_t) 0xffff8000;
1660     }
1661     else if ( aExp <= 0x7E ) {
1662         if ( aExp | aSig ) {
1663             status->float_exception_flags |= float_flag_inexact;
1664         }
1665         return 0;
1666     }
1667     shiftCount -= 0x10;
1668     aSig = ( aSig | 0x00800000 )<<8;
1669     z = aSig>>( - shiftCount );
1670     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1671         status->float_exception_flags |= float_flag_inexact;
1672     }
1673     if ( aSign ) {
1674         z = - z;
1675     }
1676     return z;
1677
1678 }
1679
1680 /*----------------------------------------------------------------------------
1681 | Returns the result of converting the single-precision floating-point value
1682 | `a' to the 64-bit two's complement integer format.  The conversion is
1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1684 | Arithmetic---which means in particular that the conversion is rounded
1685 | according to the current rounding mode.  If `a' is a NaN, the largest
1686 | positive integer is returned.  Otherwise, if the conversion overflows, the
1687 | largest integer with the same sign as `a' is returned.
1688 *----------------------------------------------------------------------------*/
1689
1690 int64_t float32_to_int64(float32 a, float_status *status)
1691 {
1692     flag aSign;
1693     int aExp;
1694     int shiftCount;
1695     uint32_t aSig;
1696     uint64_t aSig64, aSigExtra;
1697     a = float32_squash_input_denormal(a, status);
1698
1699     aSig = extractFloat32Frac( a );
1700     aExp = extractFloat32Exp( a );
1701     aSign = extractFloat32Sign( a );
1702     shiftCount = 0xBE - aExp;
1703     if ( shiftCount < 0 ) {
1704         float_raise(float_flag_invalid, status);
1705         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1706             return LIT64( 0x7FFFFFFFFFFFFFFF );
1707         }
1708         return (int64_t) LIT64( 0x8000000000000000 );
1709     }
1710     if ( aExp ) aSig |= 0x00800000;
1711     aSig64 = aSig;
1712     aSig64 <<= 40;
1713     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1714     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1715
1716 }
1717
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of converting the single-precision floating-point value
1720 | `a' to the 64-bit unsigned integer format.  The conversion is
1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1722 | Arithmetic---which means in particular that the conversion is rounded
1723 | according to the current rounding mode.  If `a' is a NaN, the largest
1724 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1725 | largest unsigned integer is returned.  If the 'a' is negative, the result
1726 | is rounded and zero is returned; values that do not round to zero will
1727 | raise the inexact exception flag.
1728 *----------------------------------------------------------------------------*/
1729
1730 uint64_t float32_to_uint64(float32 a, float_status *status)
1731 {
1732     flag aSign;
1733     int aExp;
1734     int shiftCount;
1735     uint32_t aSig;
1736     uint64_t aSig64, aSigExtra;
1737     a = float32_squash_input_denormal(a, status);
1738
1739     aSig = extractFloat32Frac(a);
1740     aExp = extractFloat32Exp(a);
1741     aSign = extractFloat32Sign(a);
1742     if ((aSign) && (aExp > 126)) {
1743         float_raise(float_flag_invalid, status);
1744         if (float32_is_any_nan(a)) {
1745             return LIT64(0xFFFFFFFFFFFFFFFF);
1746         } else {
1747             return 0;
1748         }
1749     }
1750     shiftCount = 0xBE - aExp;
1751     if (aExp) {
1752         aSig |= 0x00800000;
1753     }
1754     if (shiftCount < 0) {
1755         float_raise(float_flag_invalid, status);
1756         return LIT64(0xFFFFFFFFFFFFFFFF);
1757     }
1758
1759     aSig64 = aSig;
1760     aSig64 <<= 40;
1761     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1762     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1763 }
1764
1765 /*----------------------------------------------------------------------------
1766 | Returns the result of converting the single-precision floating-point value
1767 | `a' to the 64-bit unsigned integer format.  The conversion is
1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1769 | Arithmetic, except that the conversion is always rounded toward zero.  If
1770 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1771 | conversion overflows, the largest unsigned integer is returned.  If the
1772 | 'a' is negative, the result is rounded and zero is returned; values that do
1773 | not round to zero will raise the inexact flag.
1774 *----------------------------------------------------------------------------*/
1775
1776 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1777 {
1778     signed char current_rounding_mode = status->float_rounding_mode;
1779     set_float_rounding_mode(float_round_to_zero, status);
1780     int64_t v = float32_to_uint64(a, status);
1781     set_float_rounding_mode(current_rounding_mode, status);
1782     return v;
1783 }
1784
1785 /*----------------------------------------------------------------------------
1786 | Returns the result of converting the single-precision floating-point value
1787 | `a' to the 64-bit two's complement integer format.  The conversion is
1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1789 | Arithmetic, except that the conversion is always rounded toward zero.  If
1790 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1791 | conversion overflows, the largest integer with the same sign as `a' is
1792 | returned.
1793 *----------------------------------------------------------------------------*/
1794
1795 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1796 {
1797     flag aSign;
1798     int aExp;
1799     int shiftCount;
1800     uint32_t aSig;
1801     uint64_t aSig64;
1802     int64_t z;
1803     a = float32_squash_input_denormal(a, status);
1804
1805     aSig = extractFloat32Frac( a );
1806     aExp = extractFloat32Exp( a );
1807     aSign = extractFloat32Sign( a );
1808     shiftCount = aExp - 0xBE;
1809     if ( 0 <= shiftCount ) {
1810         if ( float32_val(a) != 0xDF000000 ) {
1811             float_raise(float_flag_invalid, status);
1812             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1813                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1814             }
1815         }
1816         return (int64_t) LIT64( 0x8000000000000000 );
1817     }
1818     else if ( aExp <= 0x7E ) {
1819         if (aExp | aSig) {
1820             status->float_exception_flags |= float_flag_inexact;
1821         }
1822         return 0;
1823     }
1824     aSig64 = aSig | 0x00800000;
1825     aSig64 <<= 40;
1826     z = aSig64>>( - shiftCount );
1827     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1828         status->float_exception_flags |= float_flag_inexact;
1829     }
1830     if ( aSign ) z = - z;
1831     return z;
1832
1833 }
1834
1835 /*----------------------------------------------------------------------------
1836 | Returns the result of converting the single-precision floating-point value
1837 | `a' to the double-precision floating-point format.  The conversion is
1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1839 | Arithmetic.
1840 *----------------------------------------------------------------------------*/
1841
1842 float64 float32_to_float64(float32 a, float_status *status)
1843 {
1844     flag aSign;
1845     int aExp;
1846     uint32_t aSig;
1847     a = float32_squash_input_denormal(a, status);
1848
1849     aSig = extractFloat32Frac( a );
1850     aExp = extractFloat32Exp( a );
1851     aSign = extractFloat32Sign( a );
1852     if ( aExp == 0xFF ) {
1853         if (aSig) {
1854             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1855         }
1856         return packFloat64( aSign, 0x7FF, 0 );
1857     }
1858     if ( aExp == 0 ) {
1859         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1860         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1861         --aExp;
1862     }
1863     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1864
1865 }
1866
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of converting the single-precision floating-point value
1869 | `a' to the extended double-precision floating-point format.  The conversion
1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871 | Arithmetic.
1872 *----------------------------------------------------------------------------*/
1873
1874 floatx80 float32_to_floatx80(float32 a, float_status *status)
1875 {
1876     flag aSign;
1877     int aExp;
1878     uint32_t aSig;
1879
1880     a = float32_squash_input_denormal(a, status);
1881     aSig = extractFloat32Frac( a );
1882     aExp = extractFloat32Exp( a );
1883     aSign = extractFloat32Sign( a );
1884     if ( aExp == 0xFF ) {
1885         if (aSig) {
1886             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1887         }
1888         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1889     }
1890     if ( aExp == 0 ) {
1891         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1892         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893     }
1894     aSig |= 0x00800000;
1895     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1896
1897 }
1898
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of converting the single-precision floating-point value
1901 | `a' to the double-precision floating-point format.  The conversion is
1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1903 | Arithmetic.
1904 *----------------------------------------------------------------------------*/
1905
1906 float128 float32_to_float128(float32 a, float_status *status)
1907 {
1908     flag aSign;
1909     int aExp;
1910     uint32_t aSig;
1911
1912     a = float32_squash_input_denormal(a, status);
1913     aSig = extractFloat32Frac( a );
1914     aExp = extractFloat32Exp( a );
1915     aSign = extractFloat32Sign( a );
1916     if ( aExp == 0xFF ) {
1917         if (aSig) {
1918             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1919         }
1920         return packFloat128( aSign, 0x7FFF, 0, 0 );
1921     }
1922     if ( aExp == 0 ) {
1923         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1924         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1925         --aExp;
1926     }
1927     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1928
1929 }
1930
1931 /*----------------------------------------------------------------------------
1932 | Rounds the single-precision floating-point value `a' to an integer, and
1933 | returns the result as a single-precision floating-point value.  The
1934 | operation is performed according to the IEC/IEEE Standard for Binary
1935 | Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1937
1938 float32 float32_round_to_int(float32 a, float_status *status)
1939 {
1940     flag aSign;
1941     int aExp;
1942     uint32_t lastBitMask, roundBitsMask;
1943     uint32_t z;
1944     a = float32_squash_input_denormal(a, status);
1945
1946     aExp = extractFloat32Exp( a );
1947     if ( 0x96 <= aExp ) {
1948         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1949             return propagateFloat32NaN(a, a, status);
1950         }
1951         return a;
1952     }
1953     if ( aExp <= 0x7E ) {
1954         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1955         status->float_exception_flags |= float_flag_inexact;
1956         aSign = extractFloat32Sign( a );
1957         switch (status->float_rounding_mode) {
1958          case float_round_nearest_even:
1959             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1960                 return packFloat32( aSign, 0x7F, 0 );
1961             }
1962             break;
1963         case float_round_ties_away:
1964             if (aExp == 0x7E) {
1965                 return packFloat32(aSign, 0x7F, 0);
1966             }
1967             break;
1968          case float_round_down:
1969             return make_float32(aSign ? 0xBF800000 : 0);
1970          case float_round_up:
1971             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1972         }
1973         return packFloat32( aSign, 0, 0 );
1974     }
1975     lastBitMask = 1;
1976     lastBitMask <<= 0x96 - aExp;
1977     roundBitsMask = lastBitMask - 1;
1978     z = float32_val(a);
1979     switch (status->float_rounding_mode) {
1980     case float_round_nearest_even:
1981         z += lastBitMask>>1;
1982         if ((z & roundBitsMask) == 0) {
1983             z &= ~lastBitMask;
1984         }
1985         break;
1986     case float_round_ties_away:
1987         z += lastBitMask >> 1;
1988         break;
1989     case float_round_to_zero:
1990         break;
1991     case float_round_up:
1992         if (!extractFloat32Sign(make_float32(z))) {
1993             z += roundBitsMask;
1994         }
1995         break;
1996     case float_round_down:
1997         if (extractFloat32Sign(make_float32(z))) {
1998             z += roundBitsMask;
1999         }
2000         break;
2001     default:
2002         abort();
2003     }
2004     z &= ~ roundBitsMask;
2005     if (z != float32_val(a)) {
2006         status->float_exception_flags |= float_flag_inexact;
2007     }
2008     return make_float32(z);
2009
2010 }
2011
2012 /*----------------------------------------------------------------------------
2013 | Returns the result of adding the absolute values of the single-precision
2014 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2015 | before being returned.  `zSign' is ignored if the result is a NaN.
2016 | The addition is performed according to the IEC/IEEE Standard for Binary
2017 | Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2019
2020 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2021                               float_status *status)
2022 {
2023     int aExp, bExp, zExp;
2024     uint32_t aSig, bSig, zSig;
2025     int expDiff;
2026
2027     aSig = extractFloat32Frac( a );
2028     aExp = extractFloat32Exp( a );
2029     bSig = extractFloat32Frac( b );
2030     bExp = extractFloat32Exp( b );
2031     expDiff = aExp - bExp;
2032     aSig <<= 6;
2033     bSig <<= 6;
2034     if ( 0 < expDiff ) {
2035         if ( aExp == 0xFF ) {
2036             if (aSig) {
2037                 return propagateFloat32NaN(a, b, status);
2038             }
2039             return a;
2040         }
2041         if ( bExp == 0 ) {
2042             --expDiff;
2043         }
2044         else {
2045             bSig |= 0x20000000;
2046         }
2047         shift32RightJamming( bSig, expDiff, &bSig );
2048         zExp = aExp;
2049     }
2050     else if ( expDiff < 0 ) {
2051         if ( bExp == 0xFF ) {
2052             if (bSig) {
2053                 return propagateFloat32NaN(a, b, status);
2054             }
2055             return packFloat32( zSign, 0xFF, 0 );
2056         }
2057         if ( aExp == 0 ) {
2058             ++expDiff;
2059         }
2060         else {
2061             aSig |= 0x20000000;
2062         }
2063         shift32RightJamming( aSig, - expDiff, &aSig );
2064         zExp = bExp;
2065     }
2066     else {
2067         if ( aExp == 0xFF ) {
2068             if (aSig | bSig) {
2069                 return propagateFloat32NaN(a, b, status);
2070             }
2071             return a;
2072         }
2073         if ( aExp == 0 ) {
2074             if (status->flush_to_zero) {
2075                 if (aSig | bSig) {
2076                     float_raise(float_flag_output_denormal, status);
2077                 }
2078                 return packFloat32(zSign, 0, 0);
2079             }
2080             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2081         }
2082         zSig = 0x40000000 + aSig + bSig;
2083         zExp = aExp;
2084         goto roundAndPack;
2085     }
2086     aSig |= 0x20000000;
2087     zSig = ( aSig + bSig )<<1;
2088     --zExp;
2089     if ( (int32_t) zSig < 0 ) {
2090         zSig = aSig + bSig;
2091         ++zExp;
2092     }
2093  roundAndPack:
2094     return roundAndPackFloat32(zSign, zExp, zSig, status);
2095
2096 }
2097
2098 /*----------------------------------------------------------------------------
2099 | Returns the result of subtracting the absolute values of the single-
2100 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2101 | difference is negated before being returned.  `zSign' is ignored if the
2102 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2103 | Standard for Binary Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2105
2106 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2107                               float_status *status)
2108 {
2109     int aExp, bExp, zExp;
2110     uint32_t aSig, bSig, zSig;
2111     int expDiff;
2112
2113     aSig = extractFloat32Frac( a );
2114     aExp = extractFloat32Exp( a );
2115     bSig = extractFloat32Frac( b );
2116     bExp = extractFloat32Exp( b );
2117     expDiff = aExp - bExp;
2118     aSig <<= 7;
2119     bSig <<= 7;
2120     if ( 0 < expDiff ) goto aExpBigger;
2121     if ( expDiff < 0 ) goto bExpBigger;
2122     if ( aExp == 0xFF ) {
2123         if (aSig | bSig) {
2124             return propagateFloat32NaN(a, b, status);
2125         }
2126         float_raise(float_flag_invalid, status);
2127         return float32_default_nan(status);
2128     }
2129     if ( aExp == 0 ) {
2130         aExp = 1;
2131         bExp = 1;
2132     }
2133     if ( bSig < aSig ) goto aBigger;
2134     if ( aSig < bSig ) goto bBigger;
2135     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2136  bExpBigger:
2137     if ( bExp == 0xFF ) {
2138         if (bSig) {
2139             return propagateFloat32NaN(a, b, status);
2140         }
2141         return packFloat32( zSign ^ 1, 0xFF, 0 );
2142     }
2143     if ( aExp == 0 ) {
2144         ++expDiff;
2145     }
2146     else {
2147         aSig |= 0x40000000;
2148     }
2149     shift32RightJamming( aSig, - expDiff, &aSig );
2150     bSig |= 0x40000000;
2151  bBigger:
2152     zSig = bSig - aSig;
2153     zExp = bExp;
2154     zSign ^= 1;
2155     goto normalizeRoundAndPack;
2156  aExpBigger:
2157     if ( aExp == 0xFF ) {
2158         if (aSig) {
2159             return propagateFloat32NaN(a, b, status);
2160         }
2161         return a;
2162     }
2163     if ( bExp == 0 ) {
2164         --expDiff;
2165     }
2166     else {
2167         bSig |= 0x40000000;
2168     }
2169     shift32RightJamming( bSig, expDiff, &bSig );
2170     aSig |= 0x40000000;
2171  aBigger:
2172     zSig = aSig - bSig;
2173     zExp = aExp;
2174  normalizeRoundAndPack:
2175     --zExp;
2176     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2177
2178 }
2179
2180 /*----------------------------------------------------------------------------
2181 | Returns the result of adding the single-precision floating-point values `a'
2182 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2183 | Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2185
2186 float32 float32_add(float32 a, float32 b, float_status *status)
2187 {
2188     flag aSign, bSign;
2189     a = float32_squash_input_denormal(a, status);
2190     b = float32_squash_input_denormal(b, status);
2191
2192     aSign = extractFloat32Sign( a );
2193     bSign = extractFloat32Sign( b );
2194     if ( aSign == bSign ) {
2195         return addFloat32Sigs(a, b, aSign, status);
2196     }
2197     else {
2198         return subFloat32Sigs(a, b, aSign, status);
2199     }
2200
2201 }
2202
2203 /*----------------------------------------------------------------------------
2204 | Returns the result of subtracting the single-precision floating-point values
2205 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2206 | for Binary Floating-Point Arithmetic.
2207 *----------------------------------------------------------------------------*/
2208
2209 float32 float32_sub(float32 a, float32 b, float_status *status)
2210 {
2211     flag aSign, bSign;
2212     a = float32_squash_input_denormal(a, status);
2213     b = float32_squash_input_denormal(b, status);
2214
2215     aSign = extractFloat32Sign( a );
2216     bSign = extractFloat32Sign( b );
2217     if ( aSign == bSign ) {
2218         return subFloat32Sigs(a, b, aSign, status);
2219     }
2220     else {
2221         return addFloat32Sigs(a, b, aSign, status);
2222     }
2223
2224 }
2225
2226 /*----------------------------------------------------------------------------
2227 | Returns the result of multiplying the single-precision floating-point values
2228 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2229 | for Binary Floating-Point Arithmetic.
2230 *----------------------------------------------------------------------------*/
2231
2232 float32 float32_mul(float32 a, float32 b, float_status *status)
2233 {
2234     flag aSign, bSign, zSign;
2235     int aExp, bExp, zExp;
2236     uint32_t aSig, bSig;
2237     uint64_t zSig64;
2238     uint32_t zSig;
2239
2240     a = float32_squash_input_denormal(a, status);
2241     b = float32_squash_input_denormal(b, status);
2242
2243     aSig = extractFloat32Frac( a );
2244     aExp = extractFloat32Exp( a );
2245     aSign = extractFloat32Sign( a );
2246     bSig = extractFloat32Frac( b );
2247     bExp = extractFloat32Exp( b );
2248     bSign = extractFloat32Sign( b );
2249     zSign = aSign ^ bSign;
2250     if ( aExp == 0xFF ) {
2251         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2252             return propagateFloat32NaN(a, b, status);
2253         }
2254         if ( ( bExp | bSig ) == 0 ) {
2255             float_raise(float_flag_invalid, status);
2256             return float32_default_nan(status);
2257         }
2258         return packFloat32( zSign, 0xFF, 0 );
2259     }
2260     if ( bExp == 0xFF ) {
2261         if (bSig) {
2262             return propagateFloat32NaN(a, b, status);
2263         }
2264         if ( ( aExp | aSig ) == 0 ) {
2265             float_raise(float_flag_invalid, status);
2266             return float32_default_nan(status);
2267         }
2268         return packFloat32( zSign, 0xFF, 0 );
2269     }
2270     if ( aExp == 0 ) {
2271         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2272         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2273     }
2274     if ( bExp == 0 ) {
2275         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2276         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2277     }
2278     zExp = aExp + bExp - 0x7F;
2279     aSig = ( aSig | 0x00800000 )<<7;
2280     bSig = ( bSig | 0x00800000 )<<8;
2281     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2282     zSig = zSig64;
2283     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2284         zSig <<= 1;
2285         --zExp;
2286     }
2287     return roundAndPackFloat32(zSign, zExp, zSig, status);
2288
2289 }
2290
2291 /*----------------------------------------------------------------------------
2292 | Returns the result of dividing the single-precision floating-point value `a'
2293 | by the corresponding value `b'.  The operation is performed according to the
2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 *----------------------------------------------------------------------------*/
2296
2297 float32 float32_div(float32 a, float32 b, float_status *status)
2298 {
2299     flag aSign, bSign, zSign;
2300     int aExp, bExp, zExp;
2301     uint32_t aSig, bSig, zSig;
2302     a = float32_squash_input_denormal(a, status);
2303     b = float32_squash_input_denormal(b, status);
2304
2305     aSig = extractFloat32Frac( a );
2306     aExp = extractFloat32Exp( a );
2307     aSign = extractFloat32Sign( a );
2308     bSig = extractFloat32Frac( b );
2309     bExp = extractFloat32Exp( b );
2310     bSign = extractFloat32Sign( b );
2311     zSign = aSign ^ bSign;
2312     if ( aExp == 0xFF ) {
2313         if (aSig) {
2314             return propagateFloat32NaN(a, b, status);
2315         }
2316         if ( bExp == 0xFF ) {
2317             if (bSig) {
2318                 return propagateFloat32NaN(a, b, status);
2319             }
2320             float_raise(float_flag_invalid, status);
2321             return float32_default_nan(status);
2322         }
2323         return packFloat32( zSign, 0xFF, 0 );
2324     }
2325     if ( bExp == 0xFF ) {
2326         if (bSig) {
2327             return propagateFloat32NaN(a, b, status);
2328         }
2329         return packFloat32( zSign, 0, 0 );
2330     }
2331     if ( bExp == 0 ) {
2332         if ( bSig == 0 ) {
2333             if ( ( aExp | aSig ) == 0 ) {
2334                 float_raise(float_flag_invalid, status);
2335                 return float32_default_nan(status);
2336             }
2337             float_raise(float_flag_divbyzero, status);
2338             return packFloat32( zSign, 0xFF, 0 );
2339         }
2340         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2341     }
2342     if ( aExp == 0 ) {
2343         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345     }
2346     zExp = aExp - bExp + 0x7D;
2347     aSig = ( aSig | 0x00800000 )<<7;
2348     bSig = ( bSig | 0x00800000 )<<8;
2349     if ( bSig <= ( aSig + aSig ) ) {
2350         aSig >>= 1;
2351         ++zExp;
2352     }
2353     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2354     if ( ( zSig & 0x3F ) == 0 ) {
2355         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2356     }
2357     return roundAndPackFloat32(zSign, zExp, zSig, status);
2358
2359 }
2360
2361 /*----------------------------------------------------------------------------
2362 | Returns the remainder of the single-precision floating-point value `a'
2363 | with respect to the corresponding value `b'.  The operation is performed
2364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366
2367 float32 float32_rem(float32 a, float32 b, float_status *status)
2368 {
2369     flag aSign, zSign;
2370     int aExp, bExp, expDiff;
2371     uint32_t aSig, bSig;
2372     uint32_t q;
2373     uint64_t aSig64, bSig64, q64;
2374     uint32_t alternateASig;
2375     int32_t sigMean;
2376     a = float32_squash_input_denormal(a, status);
2377     b = float32_squash_input_denormal(b, status);
2378
2379     aSig = extractFloat32Frac( a );
2380     aExp = extractFloat32Exp( a );
2381     aSign = extractFloat32Sign( a );
2382     bSig = extractFloat32Frac( b );
2383     bExp = extractFloat32Exp( b );
2384     if ( aExp == 0xFF ) {
2385         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2386             return propagateFloat32NaN(a, b, status);
2387         }
2388         float_raise(float_flag_invalid, status);
2389         return float32_default_nan(status);
2390     }
2391     if ( bExp == 0xFF ) {
2392         if (bSig) {
2393             return propagateFloat32NaN(a, b, status);
2394         }
2395         return a;
2396     }
2397     if ( bExp == 0 ) {
2398         if ( bSig == 0 ) {
2399             float_raise(float_flag_invalid, status);
2400             return float32_default_nan(status);
2401         }
2402         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2403     }
2404     if ( aExp == 0 ) {
2405         if ( aSig == 0 ) return a;
2406         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2407     }
2408     expDiff = aExp - bExp;
2409     aSig |= 0x00800000;
2410     bSig |= 0x00800000;
2411     if ( expDiff < 32 ) {
2412         aSig <<= 8;
2413         bSig <<= 8;
2414         if ( expDiff < 0 ) {
2415             if ( expDiff < -1 ) return a;
2416             aSig >>= 1;
2417         }
2418         q = ( bSig <= aSig );
2419         if ( q ) aSig -= bSig;
2420         if ( 0 < expDiff ) {
2421             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2422             q >>= 32 - expDiff;
2423             bSig >>= 2;
2424             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2425         }
2426         else {
2427             aSig >>= 2;
2428             bSig >>= 2;
2429         }
2430     }
2431     else {
2432         if ( bSig <= aSig ) aSig -= bSig;
2433         aSig64 = ( (uint64_t) aSig )<<40;
2434         bSig64 = ( (uint64_t) bSig )<<40;
2435         expDiff -= 64;
2436         while ( 0 < expDiff ) {
2437             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2438             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2439             aSig64 = - ( ( bSig * q64 )<<38 );
2440             expDiff -= 62;
2441         }
2442         expDiff += 64;
2443         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2444         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2445         q = q64>>( 64 - expDiff );
2446         bSig <<= 6;
2447         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2448     }
2449     do {
2450         alternateASig = aSig;
2451         ++q;
2452         aSig -= bSig;
2453     } while ( 0 <= (int32_t) aSig );
2454     sigMean = aSig + alternateASig;
2455     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2456         aSig = alternateASig;
2457     }
2458     zSign = ( (int32_t) aSig < 0 );
2459     if ( zSign ) aSig = - aSig;
2460     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2461 }
2462
2463 /*----------------------------------------------------------------------------
2464 | Returns the result of multiplying the single-precision floating-point values
2465 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2466 | multiplication.  The operation is performed according to the IEC/IEEE
2467 | Standard for Binary Floating-Point Arithmetic 754-2008.
2468 | The flags argument allows the caller to select negation of the
2469 | addend, the intermediate product, or the final result. (The difference
2470 | between this and having the caller do a separate negation is that negating
2471 | externally will flip the sign bit on NaNs.)
2472 *----------------------------------------------------------------------------*/
2473
2474 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2475                        float_status *status)
2476 {
2477     flag aSign, bSign, cSign, zSign;
2478     int aExp, bExp, cExp, pExp, zExp, expDiff;
2479     uint32_t aSig, bSig, cSig;
2480     flag pInf, pZero, pSign;
2481     uint64_t pSig64, cSig64, zSig64;
2482     uint32_t pSig;
2483     int shiftcount;
2484     flag signflip, infzero;
2485
2486     a = float32_squash_input_denormal(a, status);
2487     b = float32_squash_input_denormal(b, status);
2488     c = float32_squash_input_denormal(c, status);
2489     aSig = extractFloat32Frac(a);
2490     aExp = extractFloat32Exp(a);
2491     aSign = extractFloat32Sign(a);
2492     bSig = extractFloat32Frac(b);
2493     bExp = extractFloat32Exp(b);
2494     bSign = extractFloat32Sign(b);
2495     cSig = extractFloat32Frac(c);
2496     cExp = extractFloat32Exp(c);
2497     cSign = extractFloat32Sign(c);
2498
2499     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2500                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2501
2502     /* It is implementation-defined whether the cases of (0,inf,qnan)
2503      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504      * they return if they do), so we have to hand this information
2505      * off to the target-specific pick-a-NaN routine.
2506      */
2507     if (((aExp == 0xff) && aSig) ||
2508         ((bExp == 0xff) && bSig) ||
2509         ((cExp == 0xff) && cSig)) {
2510         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2511     }
2512
2513     if (infzero) {
2514         float_raise(float_flag_invalid, status);
2515         return float32_default_nan(status);
2516     }
2517
2518     if (flags & float_muladd_negate_c) {
2519         cSign ^= 1;
2520     }
2521
2522     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2523
2524     /* Work out the sign and type of the product */
2525     pSign = aSign ^ bSign;
2526     if (flags & float_muladd_negate_product) {
2527         pSign ^= 1;
2528     }
2529     pInf = (aExp == 0xff) || (bExp == 0xff);
2530     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2531
2532     if (cExp == 0xff) {
2533         if (pInf && (pSign ^ cSign)) {
2534             /* addition of opposite-signed infinities => InvalidOperation */
2535             float_raise(float_flag_invalid, status);
2536             return float32_default_nan(status);
2537         }
2538         /* Otherwise generate an infinity of the same sign */
2539         return packFloat32(cSign ^ signflip, 0xff, 0);
2540     }
2541
2542     if (pInf) {
2543         return packFloat32(pSign ^ signflip, 0xff, 0);
2544     }
2545
2546     if (pZero) {
2547         if (cExp == 0) {
2548             if (cSig == 0) {
2549                 /* Adding two exact zeroes */
2550                 if (pSign == cSign) {
2551                     zSign = pSign;
2552                 } else if (status->float_rounding_mode == float_round_down) {
2553                     zSign = 1;
2554                 } else {
2555                     zSign = 0;
2556                 }
2557                 return packFloat32(zSign ^ signflip, 0, 0);
2558             }
2559             /* Exact zero plus a denorm */
2560             if (status->flush_to_zero) {
2561                 float_raise(float_flag_output_denormal, status);
2562                 return packFloat32(cSign ^ signflip, 0, 0);
2563             }
2564         }
2565         /* Zero plus something non-zero : just return the something */
2566         if (flags & float_muladd_halve_result) {
2567             if (cExp == 0) {
2568                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2569             }
2570             /* Subtract one to halve, and one again because roundAndPackFloat32
2571              * wants one less than the true exponent.
2572              */
2573             cExp -= 2;
2574             cSig = (cSig | 0x00800000) << 7;
2575             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2576         }
2577         return packFloat32(cSign ^ signflip, cExp, cSig);
2578     }
2579
2580     if (aExp == 0) {
2581         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2582     }
2583     if (bExp == 0) {
2584         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2585     }
2586
2587     /* Calculate the actual result a * b + c */
2588
2589     /* Multiply first; this is easy. */
2590     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591      * because we want the true exponent, not the "one-less-than"
2592      * flavour that roundAndPackFloat32() takes.
2593      */
2594     pExp = aExp + bExp - 0x7e;
2595     aSig = (aSig | 0x00800000) << 7;
2596     bSig = (bSig | 0x00800000) << 8;
2597     pSig64 = (uint64_t)aSig * bSig;
2598     if ((int64_t)(pSig64 << 1) >= 0) {
2599         pSig64 <<= 1;
2600         pExp--;
2601     }
2602
2603     zSign = pSign ^ signflip;
2604
2605     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2606      * position 62.
2607      */
2608     if (cExp == 0) {
2609         if (!cSig) {
2610             /* Throw out the special case of c being an exact zero now */
2611             shift64RightJamming(pSig64, 32, &pSig64);
2612             pSig = pSig64;
2613             if (flags & float_muladd_halve_result) {
2614                 pExp--;
2615             }
2616             return roundAndPackFloat32(zSign, pExp - 1,
2617                                        pSig, status);
2618         }
2619         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2620     }
2621
2622     cSig64 = (uint64_t)cSig << (62 - 23);
2623     cSig64 |= LIT64(0x4000000000000000);
2624     expDiff = pExp - cExp;
2625
2626     if (pSign == cSign) {
2627         /* Addition */
2628         if (expDiff > 0) {
2629             /* scale c to match p */
2630             shift64RightJamming(cSig64, expDiff, &cSig64);
2631             zExp = pExp;
2632         } else if (expDiff < 0) {
2633             /* scale p to match c */
2634             shift64RightJamming(pSig64, -expDiff, &pSig64);
2635             zExp = cExp;
2636         } else {
2637             /* no scaling needed */
2638             zExp = cExp;
2639         }
2640         /* Add significands and make sure explicit bit ends up in posn 62 */
2641         zSig64 = pSig64 + cSig64;
2642         if ((int64_t)zSig64 < 0) {
2643             shift64RightJamming(zSig64, 1, &zSig64);
2644         } else {
2645             zExp--;
2646         }
2647     } else {
2648         /* Subtraction */
2649         if (expDiff > 0) {
2650             shift64RightJamming(cSig64, expDiff, &cSig64);
2651             zSig64 = pSig64 - cSig64;
2652             zExp = pExp;
2653         } else if (expDiff < 0) {
2654             shift64RightJamming(pSig64, -expDiff, &pSig64);
2655             zSig64 = cSig64 - pSig64;
2656             zExp = cExp;
2657             zSign ^= 1;
2658         } else {
2659             zExp = pExp;
2660             if (cSig64 < pSig64) {
2661                 zSig64 = pSig64 - cSig64;
2662             } else if (pSig64 < cSig64) {
2663                 zSig64 = cSig64 - pSig64;
2664                 zSign ^= 1;
2665             } else {
2666                 /* Exact zero */
2667                 zSign = signflip;
2668                 if (status->float_rounding_mode == float_round_down) {
2669                     zSign ^= 1;
2670                 }
2671                 return packFloat32(zSign, 0, 0);
2672             }
2673         }
2674         --zExp;
2675         /* Normalize to put the explicit bit back into bit 62. */
2676         shiftcount = countLeadingZeros64(zSig64) - 1;
2677         zSig64 <<= shiftcount;
2678         zExp -= shiftcount;
2679     }
2680     if (flags & float_muladd_halve_result) {
2681         zExp--;
2682     }
2683
2684     shift64RightJamming(zSig64, 32, &zSig64);
2685     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2686 }
2687
2688
2689 /*----------------------------------------------------------------------------
2690 | Returns the square root of the single-precision floating-point value `a'.
2691 | The operation is performed according to the IEC/IEEE Standard for Binary
2692 | Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694
2695 float32 float32_sqrt(float32 a, float_status *status)
2696 {
2697     flag aSign;
2698     int aExp, zExp;
2699     uint32_t aSig, zSig;
2700     uint64_t rem, term;
2701     a = float32_squash_input_denormal(a, status);
2702
2703     aSig = extractFloat32Frac( a );
2704     aExp = extractFloat32Exp( a );
2705     aSign = extractFloat32Sign( a );
2706     if ( aExp == 0xFF ) {
2707         if (aSig) {
2708             return propagateFloat32NaN(a, float32_zero, status);
2709         }
2710         if ( ! aSign ) return a;
2711         float_raise(float_flag_invalid, status);
2712         return float32_default_nan(status);
2713     }
2714     if ( aSign ) {
2715         if ( ( aExp | aSig ) == 0 ) return a;
2716         float_raise(float_flag_invalid, status);
2717         return float32_default_nan(status);
2718     }
2719     if ( aExp == 0 ) {
2720         if ( aSig == 0 ) return float32_zero;
2721         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2722     }
2723     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2724     aSig = ( aSig | 0x00800000 )<<8;
2725     zSig = estimateSqrt32( aExp, aSig ) + 2;
2726     if ( ( zSig & 0x7F ) <= 5 ) {
2727         if ( zSig < 2 ) {
2728             zSig = 0x7FFFFFFF;
2729             goto roundAndPack;
2730         }
2731         aSig >>= aExp & 1;
2732         term = ( (uint64_t) zSig ) * zSig;
2733         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2734         while ( (int64_t) rem < 0 ) {
2735             --zSig;
2736             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2737         }
2738         zSig |= ( rem != 0 );
2739     }
2740     shift32RightJamming( zSig, 1, &zSig );
2741  roundAndPack:
2742     return roundAndPackFloat32(0, zExp, zSig, status);
2743
2744 }
2745
2746 /*----------------------------------------------------------------------------
2747 | Returns the binary exponential of the single-precision floating-point value
2748 | `a'. The operation is performed according to the IEC/IEEE Standard for
2749 | Binary Floating-Point Arithmetic.
2750 |
2751 | Uses the following identities:
2752 |
2753 | 1. -------------------------------------------------------------------------
2754 |      x    x*ln(2)
2755 |     2  = e
2756 |
2757 | 2. -------------------------------------------------------------------------
2758 |                      2     3     4     5           n
2759 |      x        x     x     x     x     x           x
2760 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2761 |               1!    2!    3!    4!    5!          n!
2762 *----------------------------------------------------------------------------*/
2763
2764 static const float64 float32_exp2_coefficients[15] =
2765 {
2766     const_float64( 0x3ff0000000000000ll ), /*  1 */
2767     const_float64( 0x3fe0000000000000ll ), /*  2 */
2768     const_float64( 0x3fc5555555555555ll ), /*  3 */
2769     const_float64( 0x3fa5555555555555ll ), /*  4 */
2770     const_float64( 0x3f81111111111111ll ), /*  5 */
2771     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2772     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2773     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2774     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2775     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2776     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2777     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2778     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2779     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2780     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2781 };
2782
2783 float32 float32_exp2(float32 a, float_status *status)
2784 {
2785     flag aSign;
2786     int aExp;
2787     uint32_t aSig;
2788     float64 r, x, xn;
2789     int i;
2790     a = float32_squash_input_denormal(a, status);
2791
2792     aSig = extractFloat32Frac( a );
2793     aExp = extractFloat32Exp( a );
2794     aSign = extractFloat32Sign( a );
2795
2796     if ( aExp == 0xFF) {
2797         if (aSig) {
2798             return propagateFloat32NaN(a, float32_zero, status);
2799         }
2800         return (aSign) ? float32_zero : a;
2801     }
2802     if (aExp == 0) {
2803         if (aSig == 0) return float32_one;
2804     }
2805
2806     float_raise(float_flag_inexact, status);
2807
2808     /* ******************************* */
2809     /* using float64 for approximation */
2810     /* ******************************* */
2811     x = float32_to_float64(a, status);
2812     x = float64_mul(x, float64_ln2, status);
2813
2814     xn = x;
2815     r = float64_one;
2816     for (i = 0 ; i < 15 ; i++) {
2817         float64 f;
2818
2819         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2820         r = float64_add(r, f, status);
2821
2822         xn = float64_mul(xn, x, status);
2823     }
2824
2825     return float64_to_float32(r, status);
2826 }
2827
2828 /*----------------------------------------------------------------------------
2829 | Returns the binary log of the single-precision floating-point value `a'.
2830 | The operation is performed according to the IEC/IEEE Standard for Binary
2831 | Floating-Point Arithmetic.
2832 *----------------------------------------------------------------------------*/
2833 float32 float32_log2(float32 a, float_status *status)
2834 {
2835     flag aSign, zSign;
2836     int aExp;
2837     uint32_t aSig, zSig, i;
2838
2839     a = float32_squash_input_denormal(a, status);
2840     aSig = extractFloat32Frac( a );
2841     aExp = extractFloat32Exp( a );
2842     aSign = extractFloat32Sign( a );
2843
2844     if ( aExp == 0 ) {
2845         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2846         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2847     }
2848     if ( aSign ) {
2849         float_raise(float_flag_invalid, status);
2850         return float32_default_nan(status);
2851     }
2852     if ( aExp == 0xFF ) {
2853         if (aSig) {
2854             return propagateFloat32NaN(a, float32_zero, status);
2855         }
2856         return a;
2857     }
2858
2859     aExp -= 0x7F;
2860     aSig |= 0x00800000;
2861     zSign = aExp < 0;
2862     zSig = aExp << 23;
2863
2864     for (i = 1 << 22; i > 0; i >>= 1) {
2865         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2866         if ( aSig & 0x01000000 ) {
2867             aSig >>= 1;
2868             zSig |= i;
2869         }
2870     }
2871
2872     if ( zSign )
2873         zSig = -zSig;
2874
2875     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2876 }
2877
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2881 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884
2885 int float32_eq(float32 a, float32 b, float_status *status)
2886 {
2887     uint32_t av, bv;
2888     a = float32_squash_input_denormal(a, status);
2889     b = float32_squash_input_denormal(b, status);
2890
2891     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893        ) {
2894         float_raise(float_flag_invalid, status);
2895         return 0;
2896     }
2897     av = float32_val(a);
2898     bv = float32_val(b);
2899     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2900 }
2901
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than
2904 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2905 | exception is raised if either operand is a NaN.  The comparison is performed
2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2908
2909 int float32_le(float32 a, float32 b, float_status *status)
2910 {
2911     flag aSign, bSign;
2912     uint32_t av, bv;
2913     a = float32_squash_input_denormal(a, status);
2914     b = float32_squash_input_denormal(b, status);
2915
2916     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918        ) {
2919         float_raise(float_flag_invalid, status);
2920         return 0;
2921     }
2922     aSign = extractFloat32Sign( a );
2923     bSign = extractFloat32Sign( b );
2924     av = float32_val(a);
2925     bv = float32_val(b);
2926     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2927     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2928
2929 }
2930
2931 /*----------------------------------------------------------------------------
2932 | Returns 1 if the single-precision floating-point value `a' is less than
2933 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2934 | raised if either operand is a NaN.  The comparison is performed according
2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2936 *----------------------------------------------------------------------------*/
2937
2938 int float32_lt(float32 a, float32 b, float_status *status)
2939 {
2940     flag aSign, bSign;
2941     uint32_t av, bv;
2942     a = float32_squash_input_denormal(a, status);
2943     b = float32_squash_input_denormal(b, status);
2944
2945     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2946          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2947        ) {
2948         float_raise(float_flag_invalid, status);
2949         return 0;
2950     }
2951     aSign = extractFloat32Sign( a );
2952     bSign = extractFloat32Sign( b );
2953     av = float32_val(a);
2954     bv = float32_val(b);
2955     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2956     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2957
2958 }
2959
2960 /*----------------------------------------------------------------------------
2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2962 | be compared, and 0 otherwise.  The invalid exception is raised if either
2963 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2964 | Standard for Binary Floating-Point Arithmetic.
2965 *----------------------------------------------------------------------------*/
2966
2967 int float32_unordered(float32 a, float32 b, float_status *status)
2968 {
2969     a = float32_squash_input_denormal(a, status);
2970     b = float32_squash_input_denormal(b, status);
2971
2972     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2973          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2974        ) {
2975         float_raise(float_flag_invalid, status);
2976         return 1;
2977     }
2978     return 0;
2979 }
2980
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is equal to
2983 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2984 | exception.  The comparison is performed according to the IEC/IEEE Standard
2985 | for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2987
2988 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2989 {
2990     a = float32_squash_input_denormal(a, status);
2991     b = float32_squash_input_denormal(b, status);
2992
2993     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2994          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2995        ) {
2996         if (float32_is_signaling_nan(a, status)
2997          || float32_is_signaling_nan(b, status)) {
2998             float_raise(float_flag_invalid, status);
2999         }
3000         return 0;
3001     }
3002     return ( float32_val(a) == float32_val(b) ) ||
3003             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3004 }
3005
3006 /*----------------------------------------------------------------------------
3007 | Returns 1 if the single-precision floating-point value `a' is less than or
3008 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3009 | cause an exception.  Otherwise, the comparison is performed according to the
3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3012
3013 int float32_le_quiet(float32 a, float32 b, float_status *status)
3014 {
3015     flag aSign, bSign;
3016     uint32_t av, bv;
3017     a = float32_squash_input_denormal(a, status);
3018     b = float32_squash_input_denormal(b, status);
3019
3020     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3021          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3022        ) {
3023         if (float32_is_signaling_nan(a, status)
3024          || float32_is_signaling_nan(b, status)) {
3025             float_raise(float_flag_invalid, status);
3026         }
3027         return 0;
3028     }
3029     aSign = extractFloat32Sign( a );
3030     bSign = extractFloat32Sign( b );
3031     av = float32_val(a);
3032     bv = float32_val(b);
3033     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3034     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3035
3036 }
3037
3038 /*----------------------------------------------------------------------------
3039 | Returns 1 if the single-precision floating-point value `a' is less than
3040 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3041 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3042 | Standard for Binary Floating-Point Arithmetic.
3043 *----------------------------------------------------------------------------*/
3044
3045 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3046 {
3047     flag aSign, bSign;
3048     uint32_t av, bv;
3049     a = float32_squash_input_denormal(a, status);
3050     b = float32_squash_input_denormal(b, status);
3051
3052     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3053          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3054        ) {
3055         if (float32_is_signaling_nan(a, status)
3056          || float32_is_signaling_nan(b, status)) {
3057             float_raise(float_flag_invalid, status);
3058         }
3059         return 0;
3060     }
3061     aSign = extractFloat32Sign( a );
3062     bSign = extractFloat32Sign( b );
3063     av = float32_val(a);
3064     bv = float32_val(b);
3065     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3066     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3067
3068 }
3069
3070 /*----------------------------------------------------------------------------
3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3073 | comparison is performed according to the IEC/IEEE Standard for Binary
3074 | Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3076
3077 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3078 {
3079     a = float32_squash_input_denormal(a, status);
3080     b = float32_squash_input_denormal(b, status);
3081
3082     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3083          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3084        ) {
3085         if (float32_is_signaling_nan(a, status)
3086          || float32_is_signaling_nan(b, status)) {
3087             float_raise(float_flag_invalid, status);
3088         }
3089         return 1;
3090     }
3091     return 0;
3092 }
3093
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format.  The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic---which means in particular that the conversion is rounded
3099 | according to the current rounding mode.  If `a' is a NaN, the largest
3100 | positive integer is returned.  Otherwise, if the conversion overflows, the
3101 | largest integer with the same sign as `a' is returned.
3102 *----------------------------------------------------------------------------*/
3103
3104 int32_t float64_to_int32(float64 a, float_status *status)
3105 {
3106     flag aSign;
3107     int aExp;
3108     int shiftCount;
3109     uint64_t aSig;
3110     a = float64_squash_input_denormal(a, status);
3111
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     aSign = extractFloat64Sign( a );
3115     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3117     shiftCount = 0x42C - aExp;
3118     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3119     return roundAndPackInt32(aSign, aSig, status);
3120
3121 }
3122
3123 /*----------------------------------------------------------------------------
3124 | Returns the result of converting the double-precision floating-point value
3125 | `a' to the 32-bit two's complement integer format.  The conversion is
3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3127 | Arithmetic, except that the conversion is always rounded toward zero.
3128 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3129 | the conversion overflows, the largest integer with the same sign as `a' is
3130 | returned.
3131 *----------------------------------------------------------------------------*/
3132
3133 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3134 {
3135     flag aSign;
3136     int aExp;
3137     int shiftCount;
3138     uint64_t aSig, savedASig;
3139     int32_t z;
3140     a = float64_squash_input_denormal(a, status);
3141
3142     aSig = extractFloat64Frac( a );
3143     aExp = extractFloat64Exp( a );
3144     aSign = extractFloat64Sign( a );
3145     if ( 0x41E < aExp ) {
3146         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3147         goto invalid;
3148     }
3149     else if ( aExp < 0x3FF ) {
3150         if (aExp || aSig) {
3151             status->float_exception_flags |= float_flag_inexact;
3152         }
3153         return 0;
3154     }
3155     aSig |= LIT64( 0x0010000000000000 );
3156     shiftCount = 0x433 - aExp;
3157     savedASig = aSig;
3158     aSig >>= shiftCount;
3159     z = aSig;
3160     if ( aSign ) z = - z;
3161     if ( ( z < 0 ) ^ aSign ) {
3162  invalid:
3163         float_raise(float_flag_invalid, status);
3164         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3165     }
3166     if ( ( aSig<<shiftCount ) != savedASig ) {
3167         status->float_exception_flags |= float_flag_inexact;
3168     }
3169     return z;
3170
3171 }
3172
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of converting the double-precision floating-point value
3175 | `a' to the 16-bit two's complement integer format.  The conversion is
3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3177 | Arithmetic, except that the conversion is always rounded toward zero.
3178 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3179 | the conversion overflows, the largest integer with the same sign as `a' is
3180 | returned.
3181 *----------------------------------------------------------------------------*/
3182
3183 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3184 {
3185     flag aSign;
3186     int aExp;
3187     int shiftCount;
3188     uint64_t aSig, savedASig;
3189     int32_t z;
3190
3191     aSig = extractFloat64Frac( a );
3192     aExp = extractFloat64Exp( a );
3193     aSign = extractFloat64Sign( a );
3194     if ( 0x40E < aExp ) {
3195         if ( ( aExp == 0x7FF ) && aSig ) {
3196             aSign = 0;
3197         }
3198         goto invalid;
3199     }
3200     else if ( aExp < 0x3FF ) {
3201         if ( aExp || aSig ) {
3202             status->float_exception_flags |= float_flag_inexact;
3203         }
3204         return 0;
3205     }
3206     aSig |= LIT64( 0x0010000000000000 );
3207     shiftCount = 0x433 - aExp;
3208     savedASig = aSig;
3209     aSig >>= shiftCount;
3210     z = aSig;
3211     if ( aSign ) {
3212         z = - z;
3213     }
3214     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3215  invalid:
3216         float_raise(float_flag_invalid, status);
3217         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3218     }
3219     if ( ( aSig<<shiftCount ) != savedASig ) {
3220         status->float_exception_flags |= float_flag_inexact;
3221     }
3222     return z;
3223 }
3224
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of converting the double-precision floating-point value
3227 | `a' to the 64-bit two's complement integer format.  The conversion is
3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3229 | Arithmetic---which means in particular that the conversion is rounded
3230 | according to the current rounding mode.  If `a' is a NaN, the largest
3231 | positive integer is returned.  Otherwise, if the conversion overflows, the
3232 | largest integer with the same sign as `a' is returned.
3233 *----------------------------------------------------------------------------*/
3234
3235 int64_t float64_to_int64(float64 a, float_status *status)
3236 {
3237     flag aSign;
3238     int aExp;
3239     int shiftCount;
3240     uint64_t aSig, aSigExtra;
3241     a = float64_squash_input_denormal(a, status);
3242
3243     aSig = extractFloat64Frac( a );
3244     aExp = extractFloat64Exp( a );
3245     aSign = extractFloat64Sign( a );
3246     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3247     shiftCount = 0x433 - aExp;
3248     if ( shiftCount <= 0 ) {
3249         if ( 0x43E < aExp ) {
3250             float_raise(float_flag_invalid, status);
3251             if (    ! aSign
3252                  || (    ( aExp == 0x7FF )
3253                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3254                ) {
3255                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3256             }
3257             return (int64_t) LIT64( 0x8000000000000000 );
3258         }
3259         aSigExtra = 0;
3260         aSig <<= - shiftCount;
3261     }
3262     else {
3263         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264     }
3265     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3266
3267 }
3268
3269 /*----------------------------------------------------------------------------
3270 | Returns the result of converting the double-precision floating-point value
3271 | `a' to the 64-bit two's complement integer format.  The conversion is
3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3273 | Arithmetic, except that the conversion is always rounded toward zero.
3274 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3275 | the conversion overflows, the largest integer with the same sign as `a' is
3276 | returned.
3277 *----------------------------------------------------------------------------*/
3278
3279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3280 {
3281     flag aSign;
3282     int aExp;
3283     int shiftCount;
3284     uint64_t aSig;
3285     int64_t z;
3286     a = float64_squash_input_denormal(a, status);
3287
3288     aSig = extractFloat64Frac( a );
3289     aExp = extractFloat64Exp( a );
3290     aSign = extractFloat64Sign( a );
3291     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3292     shiftCount = aExp - 0x433;
3293     if ( 0 <= shiftCount ) {
3294         if ( 0x43E <= aExp ) {
3295             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3296                 float_raise(float_flag_invalid, status);
3297                 if (    ! aSign
3298                      || (    ( aExp == 0x7FF )
3299                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3300                    ) {
3301                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3302                 }
3303             }
3304             return (int64_t) LIT64( 0x8000000000000000 );
3305         }
3306         z = aSig<<shiftCount;
3307     }
3308     else {
3309         if ( aExp < 0x3FE ) {
3310             if (aExp | aSig) {
3311                 status->float_exception_flags |= float_flag_inexact;
3312             }
3313             return 0;
3314         }
3315         z = aSig>>( - shiftCount );
3316         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3317             status->float_exception_flags |= float_flag_inexact;
3318         }
3319     }
3320     if ( aSign ) z = - z;
3321     return z;
3322
3323 }
3324
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of converting the double-precision floating-point value
3327 | `a' to the single-precision floating-point format.  The conversion is
3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3329 | Arithmetic.
3330 *----------------------------------------------------------------------------*/
3331
3332 float32 float64_to_float32(float64 a, float_status *status)
3333 {
3334     flag aSign;
3335     int aExp;
3336     uint64_t aSig;
3337     uint32_t zSig;
3338     a = float64_squash_input_denormal(a, status);
3339
3340     aSig = extractFloat64Frac( a );
3341     aExp = extractFloat64Exp( a );
3342     aSign = extractFloat64Sign( a );
3343     if ( aExp == 0x7FF ) {
3344         if (aSig) {
3345             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3346         }
3347         return packFloat32( aSign, 0xFF, 0 );
3348     }
3349     shift64RightJamming( aSig, 22, &aSig );
3350     zSig = aSig;
3351     if ( aExp || zSig ) {
3352         zSig |= 0x40000000;
3353         aExp -= 0x381;
3354     }
3355     return roundAndPackFloat32(aSign, aExp, zSig, status);
3356
3357 }
3358
3359
3360 /*----------------------------------------------------------------------------
3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362 | half-precision floating-point value, returning the result.  After being
3363 | shifted into the proper positions, the three fields are simply added
3364 | together to form the result.  This means that any integer portion of `zSig'
3365 | will be added into the exponent.  Since a properly normalized significand
3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3367 | than the desired result exponent whenever `zSig' is a complete, normalized
3368 | significand.
3369 *----------------------------------------------------------------------------*/
3370 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3371 {
3372     return make_float16(
3373         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3374 }
3375
3376 /*----------------------------------------------------------------------------
3377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3378 | and significand `zSig', and returns the proper half-precision floating-
3379 | point value corresponding to the abstract input.  Ordinarily, the abstract
3380 | value is simply rounded and packed into the half-precision format, with
3381 | the inexact exception raised if the abstract input cannot be represented
3382 | exactly.  However, if the abstract value is too large, the overflow and
3383 | inexact exceptions are raised and an infinity or maximal finite value is
3384 | returned.  If the abstract value is too small, the input value is rounded to
3385 | a subnormal number, and the underflow and inexact exceptions are raised if
3386 | the abstract input cannot be represented exactly as a subnormal half-
3387 | precision floating-point number.
3388 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3389 | ARM-style "alternative representation", which omits the NaN and Inf
3390 | encodings in order to raise the maximum representable exponent by one.
3391 |     The input significand `zSig' has its binary point between bits 22
3392 | and 23, which is 13 bits to the left of the usual location.  This shifted
3393 | significand must be normalized or smaller.  If `zSig' is not normalized,
3394 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3395 | and it must not require rounding.  In the usual case that `zSig' is
3396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3397 | Note the slightly odd position of the binary point in zSig compared with the
3398 | other roundAndPackFloat functions. This should probably be fixed if we
3399 | need to implement more float16 routines than just conversion.
3400 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3401 | Binary Floating-Point Arithmetic.
3402 *----------------------------------------------------------------------------*/
3403
3404 static float16 roundAndPackFloat16(flag zSign, int zExp,
3405                                    uint32_t zSig, flag ieee,
3406                                    float_status *status)
3407 {
3408     int maxexp = ieee ? 29 : 30;
3409     uint32_t mask;
3410     uint32_t increment;
3411     bool rounding_bumps_exp;
3412     bool is_tiny = false;
3413
3414     /* Calculate the mask of bits of the mantissa which are not
3415      * representable in half-precision and will be lost.
3416      */
3417     if (zExp < 1) {
3418         /* Will be denormal in halfprec */
3419         mask = 0x00ffffff;
3420         if (zExp >= -11) {
3421             mask >>= 11 + zExp;
3422         }
3423     } else {
3424         /* Normal number in halfprec */
3425         mask = 0x00001fff;
3426     }
3427
3428     switch (status->float_rounding_mode) {
3429     case float_round_nearest_even:
3430         increment = (mask + 1) >> 1;
3431         if ((zSig & mask) == increment) {
3432             increment = zSig & (increment << 1);
3433         }
3434         break;
3435     case float_round_ties_away:
3436         increment = (mask + 1) >> 1;
3437         break;
3438     case float_round_up:
3439         increment = zSign ? 0 : mask;
3440         break;
3441     case float_round_down:
3442         increment = zSign ? mask : 0;
3443         break;
3444     default: /* round_to_zero */
3445         increment = 0;
3446         break;
3447     }
3448
3449     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3450
3451     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3452         if (ieee) {
3453             float_raise(float_flag_overflow | float_flag_inexact, status);
3454             return packFloat16(zSign, 0x1f, 0);
3455         } else {
3456             float_raise(float_flag_invalid, status);
3457             return packFloat16(zSign, 0x1f, 0x3ff);
3458         }
3459     }
3460
3461     if (zExp < 0) {
3462         /* Note that flush-to-zero does not affect half-precision results */
3463         is_tiny =
3464             (status->float_detect_tininess == float_tininess_before_rounding)
3465             || (zExp < -1)
3466             || (!rounding_bumps_exp);
3467     }
3468     if (zSig & mask) {
3469         float_raise(float_flag_inexact, status);
3470         if (is_tiny) {
3471             float_raise(float_flag_underflow, status);
3472         }
3473     }
3474
3475     zSig += increment;
3476     if (rounding_bumps_exp) {
3477         zSig >>= 1;
3478         zExp++;
3479     }
3480
3481     if (zExp < -10) {
3482         return packFloat16(zSign, 0, 0);
3483     }
3484     if (zExp < 0) {
3485         zSig >>= -zExp;
3486         zExp = 0;
3487     }
3488     return packFloat16(zSign, zExp, zSig >> 13);
3489 }
3490
/*
 * Shifts the subnormal half-precision significand `aSig' left by its
 * leading-zero count minus 21, which puts the most significant set bit of a
 * significand below 2^11 at bit 10.  The shifted significand is stored at
 * `zSigPtr' and one minus the shift count at `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t shift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3498
3499 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3500    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3501
3502 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3503 {
3504     flag aSign;
3505     int aExp;
3506     uint32_t aSig;
3507
3508     aSign = extractFloat16Sign(a);
3509     aExp = extractFloat16Exp(a);
3510     aSig = extractFloat16Frac(a);
3511
3512     if (aExp == 0x1f && ieee) {
3513         if (aSig) {
3514             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3515         }
3516         return packFloat32(aSign, 0xff, 0);
3517     }
3518     if (aExp == 0) {
3519         if (aSig == 0) {
3520             return packFloat32(aSign, 0, 0);
3521         }
3522
3523         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3524         aExp--;
3525     }
3526     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3527 }
3528
3529 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3530 {
3531     flag aSign;
3532     int aExp;
3533     uint32_t aSig;
3534
3535     a = float32_squash_input_denormal(a, status);
3536
3537     aSig = extractFloat32Frac( a );
3538     aExp = extractFloat32Exp( a );
3539     aSign = extractFloat32Sign( a );
3540     if ( aExp == 0xFF ) {
3541         if (aSig) {
3542             /* Input is a NaN */
3543             if (!ieee) {
3544                 float_raise(float_flag_invalid, status);
3545                 return packFloat16(aSign, 0, 0);
3546             }
3547             return commonNaNToFloat16(
3548                 float32ToCommonNaN(a, status), status);
3549         }
3550         /* Infinity */
3551         if (!ieee) {
3552             float_raise(float_flag_invalid, status);
3553             return packFloat16(aSign, 0x1f, 0x3ff);
3554         }
3555         return packFloat16(aSign, 0x1f, 0);
3556     }
3557     if (aExp == 0 && aSig == 0) {
3558         return packFloat16(aSign, 0, 0);
3559     }
3560     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3561      * even if the input is denormal; however this is harmless because
3562      * the largest possible single-precision denormal is still smaller
3563      * than the smallest representable half-precision denormal, and so we
3564      * will end up ignoring aSig and returning via the "always return zero"
3565      * codepath.
3566      */
3567     aSig |= 0x00800000;
3568     aExp -= 0x71;
3569
3570     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3571 }
3572
3573 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3574 {
3575     flag aSign;
3576     int aExp;
3577     uint32_t aSig;
3578
3579     aSign = extractFloat16Sign(a);
3580     aExp = extractFloat16Exp(a);
3581     aSig = extractFloat16Frac(a);
3582
3583     if (aExp == 0x1f && ieee) {
3584         if (aSig) {
3585             return commonNaNToFloat64(
3586                 float16ToCommonNaN(a, status), status);
3587         }
3588         return packFloat64(aSign, 0x7ff, 0);
3589     }
3590     if (aExp == 0) {
3591         if (aSig == 0) {
3592             return packFloat64(aSign, 0, 0);
3593         }
3594
3595         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3596         aExp--;
3597     }
3598     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3599 }
3600
3601 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3602 {
3603     flag aSign;
3604     int aExp;
3605     uint64_t aSig;
3606     uint32_t zSig;
3607
3608     a = float64_squash_input_denormal(a, status);
3609
3610     aSig = extractFloat64Frac(a);
3611     aExp = extractFloat64Exp(a);
3612     aSign = extractFloat64Sign(a);
3613     if (aExp == 0x7FF) {
3614         if (aSig) {
3615             /* Input is a NaN */
3616             if (!ieee) {
3617                 float_raise(float_flag_invalid, status);
3618                 return packFloat16(aSign, 0, 0);
3619             }
3620             return commonNaNToFloat16(
3621                 float64ToCommonNaN(a, status), status);
3622         }
3623         /* Infinity */
3624         if (!ieee) {
3625             float_raise(float_flag_invalid, status);
3626             return packFloat16(aSign, 0x1f, 0x3ff);
3627         }
3628         return packFloat16(aSign, 0x1f, 0);
3629     }
3630     shift64RightJamming(aSig, 29, &aSig);
3631     zSig = aSig;
3632     if (aExp == 0 && zSig == 0) {
3633         return packFloat16(aSign, 0, 0);
3634     }
3635     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3636      * even if the input is denormal; however this is harmless because
3637      * the largest possible single-precision denormal is still smaller
3638      * than the smallest representable half-precision denormal, and so we
3639      * will end up ignoring aSig and returning via the "always return zero"
3640      * codepath.
3641      */
3642     zSig |= 0x00800000;
3643     aExp -= 0x3F1;
3644
3645     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3646 }
3647
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the extended double-precision floating-point format.  The conversion
3651 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 | Arithmetic.
3653 *----------------------------------------------------------------------------*/
3654
3655 floatx80 float64_to_floatx80(float64 a, float_status *status)
3656 {
3657     flag aSign;
3658     int aExp;
3659     uint64_t aSig;
3660
3661     a = float64_squash_input_denormal(a, status);
3662     aSig = extractFloat64Frac( a );
3663     aExp = extractFloat64Exp( a );
3664     aSign = extractFloat64Sign( a );
3665     if ( aExp == 0x7FF ) {
3666         if (aSig) {
3667             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3668         }
3669         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670     }
3671     if ( aExp == 0 ) {
3672         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3673         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674     }
3675     return
3676         packFloatx80(
3677             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3678
3679 }
3680
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the double-precision floating-point value
3683 | `a' to the quadruple-precision floating-point format.  The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3685 | Arithmetic.
3686 *----------------------------------------------------------------------------*/
3687
3688 float128 float64_to_float128(float64 a, float_status *status)
3689 {
3690     flag aSign;
3691     int aExp;
3692     uint64_t aSig, zSig0, zSig1;
3693
3694     a = float64_squash_input_denormal(a, status);
3695     aSig = extractFloat64Frac( a );
3696     aExp = extractFloat64Exp( a );
3697     aSign = extractFloat64Sign( a );
3698     if ( aExp == 0x7FF ) {
3699         if (aSig) {
3700             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3701         }
3702         return packFloat128( aSign, 0x7FFF, 0, 0 );
3703     }
3704     if ( aExp == 0 ) {
3705         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3706         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3707         --aExp;
3708     }
3709     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3710     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3711
3712 }
3713
3714 /*----------------------------------------------------------------------------
3715 | Rounds the double-precision floating-point value `a' to an integer, and
3716 | returns the result as a double-precision floating-point value.  The
3717 | operation is performed according to the IEC/IEEE Standard for Binary
3718 | Floating-Point Arithmetic.
3719 *----------------------------------------------------------------------------*/
3720
3721 float64 float64_round_to_int(float64 a, float_status *status)
3722 {
3723     flag aSign;
3724     int aExp;
3725     uint64_t lastBitMask, roundBitsMask;
3726     uint64_t z;
3727     a = float64_squash_input_denormal(a, status);
3728
3729     aExp = extractFloat64Exp( a );
3730     if ( 0x433 <= aExp ) {
3731         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3732             return propagateFloat64NaN(a, a, status);
3733         }
3734         return a;
3735     }
3736     if ( aExp < 0x3FF ) {
3737         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3738         status->float_exception_flags |= float_flag_inexact;
3739         aSign = extractFloat64Sign( a );
3740         switch (status->float_rounding_mode) {
3741          case float_round_nearest_even:
3742             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3743                 return packFloat64( aSign, 0x3FF, 0 );
3744             }
3745             break;
3746         case float_round_ties_away:
3747             if (aExp == 0x3FE) {
3748                 return packFloat64(aSign, 0x3ff, 0);
3749             }
3750             break;
3751          case float_round_down:
3752             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3753          case float_round_up:
3754             return make_float64(
3755             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3756         }
3757         return packFloat64( aSign, 0, 0 );
3758     }
3759     lastBitMask = 1;
3760     lastBitMask <<= 0x433 - aExp;
3761     roundBitsMask = lastBitMask - 1;
3762     z = float64_val(a);
3763     switch (status->float_rounding_mode) {
3764     case float_round_nearest_even:
3765         z += lastBitMask >> 1;
3766         if ((z & roundBitsMask) == 0) {
3767             z &= ~lastBitMask;
3768         }
3769         break;
3770     case float_round_ties_away:
3771         z += lastBitMask >> 1;
3772         break;
3773     case float_round_to_zero:
3774         break;
3775     case float_round_up:
3776         if (!extractFloat64Sign(make_float64(z))) {
3777             z += roundBitsMask;
3778         }
3779         break;
3780     case float_round_down:
3781         if (extractFloat64Sign(make_float64(z))) {
3782             z += roundBitsMask;
3783         }
3784         break;
3785     default:
3786         abort();
3787     }
3788     z &= ~ roundBitsMask;
3789     if (z != float64_val(a)) {
3790         status->float_exception_flags |= float_flag_inexact;
3791     }
3792     return make_float64(z);
3793
3794 }
3795
3796 float64 float64_trunc_to_int(float64 a, float_status *status)
3797 {
3798     int oldmode;
3799     float64 res;
3800     oldmode = status->float_rounding_mode;
3801     status->float_rounding_mode = float_round_to_zero;
3802     res = float64_round_to_int(a, status);
3803     status->float_rounding_mode = oldmode;
3804     return res;
3805 }
3806
3807 /*----------------------------------------------------------------------------
3808 | Returns the result of adding the absolute values of the double-precision
3809 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3810 | before being returned.  `zSign' is ignored if the result is a NaN.
3811 | The addition is performed according to the IEC/IEEE Standard for Binary
3812 | Floating-Point Arithmetic.
3813 *----------------------------------------------------------------------------*/
3814
3815 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3816                               float_status *status)
3817 {
3818     int aExp, bExp, zExp;
3819     uint64_t aSig, bSig, zSig;
3820     int expDiff;
3821
3822     aSig = extractFloat64Frac( a );
3823     aExp = extractFloat64Exp( a );
3824     bSig = extractFloat64Frac( b );
3825     bExp = extractFloat64Exp( b );
3826     expDiff = aExp - bExp;
3827     aSig <<= 9;
3828     bSig <<= 9;
3829     if ( 0 < expDiff ) {
3830         if ( aExp == 0x7FF ) {
3831             if (aSig) {
3832                 return propagateFloat64NaN(a, b, status);
3833             }
3834             return a;
3835         }
3836         if ( bExp == 0 ) {
3837             --expDiff;
3838         }
3839         else {
3840             bSig |= LIT64( 0x2000000000000000 );
3841         }
3842         shift64RightJamming( bSig, expDiff, &bSig );
3843         zExp = aExp;
3844     }
3845     else if ( expDiff < 0 ) {
3846         if ( bExp == 0x7FF ) {
3847             if (bSig) {
3848                 return propagateFloat64NaN(a, b, status);
3849             }
3850             return packFloat64( zSign, 0x7FF, 0 );
3851         }
3852         if ( aExp == 0 ) {
3853             ++expDiff;
3854         }
3855         else {
3856             aSig |= LIT64( 0x2000000000000000 );
3857         }
3858         shift64RightJamming( aSig, - expDiff, &aSig );
3859         zExp = bExp;
3860     }
3861     else {
3862         if ( aExp == 0x7FF ) {
3863             if (aSig | bSig) {
3864                 return propagateFloat64NaN(a, b, status);
3865             }
3866             return a;
3867         }
3868         if ( aExp == 0 ) {
3869             if (status->flush_to_zero) {
3870                 if (aSig | bSig) {
3871                     float_raise(float_flag_output_denormal, status);
3872                 }
3873                 return packFloat64(zSign, 0, 0);
3874             }
3875             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3876         }
3877         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3878         zExp = aExp;
3879         goto roundAndPack;
3880     }
3881     aSig |= LIT64( 0x2000000000000000 );
3882     zSig = ( aSig + bSig )<<1;
3883     --zExp;
3884     if ( (int64_t) zSig < 0 ) {
3885         zSig = aSig + bSig;
3886         ++zExp;
3887     }
3888  roundAndPack:
3889     return roundAndPackFloat64(zSign, zExp, zSig, status);
3890
3891 }
3892
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of subtracting the absolute values of the double-
3895 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3896 | difference is negated before being returned.  `zSign' is ignored if the
3897 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3898 | Standard for Binary Floating-Point Arithmetic.
3899 *----------------------------------------------------------------------------*/
3900
3901 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3902                               float_status *status)
3903 {
3904     int aExp, bExp, zExp;
3905     uint64_t aSig, bSig, zSig;
3906     int expDiff;
3907
3908     aSig = extractFloat64Frac( a );
3909     aExp = extractFloat64Exp( a );
3910     bSig = extractFloat64Frac( b );
3911     bExp = extractFloat64Exp( b );
3912     expDiff = aExp - bExp;
3913     aSig <<= 10;
3914     bSig <<= 10;
3915     if ( 0 < expDiff ) goto aExpBigger;
3916     if ( expDiff < 0 ) goto bExpBigger;
3917     if ( aExp == 0x7FF ) {
3918         if (aSig | bSig) {
3919             return propagateFloat64NaN(a, b, status);
3920         }
3921         float_raise(float_flag_invalid, status);
3922         return float64_default_nan(status);
3923     }
3924     if ( aExp == 0 ) {
3925         aExp = 1;
3926         bExp = 1;
3927     }
3928     if ( bSig < aSig ) goto aBigger;
3929     if ( aSig < bSig ) goto bBigger;
3930     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3931  bExpBigger:
3932     if ( bExp == 0x7FF ) {
3933         if (bSig) {
3934             return propagateFloat64NaN(a, b, status);
3935         }
3936         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3937     }
3938     if ( aExp == 0 ) {
3939         ++expDiff;
3940     }
3941     else {
3942         aSig |= LIT64( 0x4000000000000000 );
3943     }
3944     shift64RightJamming( aSig, - expDiff, &aSig );
3945     bSig |= LIT64( 0x4000000000000000 );
3946  bBigger:
3947     zSig = bSig - aSig;
3948     zExp = bExp;
3949     zSign ^= 1;
3950     goto normalizeRoundAndPack;
3951  aExpBigger:
3952     if ( aExp == 0x7FF ) {
3953         if (aSig) {
3954             return propagateFloat64NaN(a, b, status);
3955         }
3956         return a;
3957     }
3958     if ( bExp == 0 ) {
3959         --expDiff;
3960     }
3961     else {
3962         bSig |= LIT64( 0x4000000000000000 );
3963     }
3964     shift64RightJamming( bSig, expDiff, &bSig );
3965     aSig |= LIT64( 0x4000000000000000 );
3966  aBigger:
3967     zSig = aSig - bSig;
3968     zExp = aExp;
3969  normalizeRoundAndPack:
3970     --zExp;
3971     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3972
3973 }
3974
3975 /*----------------------------------------------------------------------------
3976 | Returns the result of adding the double-precision floating-point values `a'
3977 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3978 | Binary Floating-Point Arithmetic.
3979 *----------------------------------------------------------------------------*/
3980
3981 float64 float64_add(float64 a, float64 b, float_status *status)
3982 {
3983     flag aSign, bSign;
3984     a = float64_squash_input_denormal(a, status);
3985     b = float64_squash_input_denormal(b, status);
3986
3987     aSign = extractFloat64Sign( a );
3988     bSign = extractFloat64Sign( b );
3989     if ( aSign == bSign ) {
3990         return addFloat64Sigs(a, b, aSign, status);
3991     }
3992     else {
3993         return subFloat64Sigs(a, b, aSign, status);
3994     }
3995
3996 }
3997
3998 /*----------------------------------------------------------------------------
3999 | Returns the result of subtracting the double-precision floating-point values
4000 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4001 | for Binary Floating-Point Arithmetic.
4002 *----------------------------------------------------------------------------*/
4003
4004 float64 float64_sub(float64 a, float64 b, float_status *status)
4005 {
4006     flag aSign, bSign;
4007     a = float64_squash_input_denormal(a, status);
4008     b = float64_squash_input_denormal(b, status);
4009
4010     aSign = extractFloat64Sign( a );
4011     bSign = extractFloat64Sign( b );
4012     if ( aSign == bSign ) {
4013         return subFloat64Sigs(a, b, aSign, status);
4014     }
4015     else {
4016         return addFloat64Sigs(a, b, aSign, status);
4017     }
4018
4019 }
4020
4021 /*----------------------------------------------------------------------------
4022 | Returns the result of multiplying the double-precision floating-point values
4023 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4024 | for Binary Floating-Point Arithmetic.
4025 *----------------------------------------------------------------------------*/
4026
4027 float64 float64_mul(float64 a, float64 b, float_status *status)
4028 {
4029     flag aSign, bSign, zSign;
4030     int aExp, bExp, zExp;
4031     uint64_t aSig, bSig, zSig0, zSig1;
4032
4033     a = float64_squash_input_denormal(a, status);
4034     b = float64_squash_input_denormal(b, status);
4035
4036     aSig = extractFloat64Frac( a );
4037     aExp = extractFloat64Exp( a );
4038     aSign = extractFloat64Sign( a );
4039     bSig = extractFloat64Frac( b );
4040     bExp = extractFloat64Exp( b );
4041     bSign = extractFloat64Sign( b );
4042     zSign = aSign ^ bSign;
4043     if ( aExp == 0x7FF ) {
4044         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4045             return propagateFloat64NaN(a, b, status);
4046         }
4047         if ( ( bExp | bSig ) == 0 ) {
4048             float_raise(float_flag_invalid, status);
4049             return float64_default_nan(status);
4050         }
4051         return packFloat64( zSign, 0x7FF, 0 );
4052     }
4053     if ( bExp == 0x7FF ) {
4054         if (bSig) {
4055             return propagateFloat64NaN(a, b, status);
4056         }
4057         if ( ( aExp | aSig ) == 0 ) {
4058             float_raise(float_flag_invalid, status);
4059             return float64_default_nan(status);
4060         }
4061         return packFloat64( zSign, 0x7FF, 0 );
4062     }
4063     if ( aExp == 0 ) {
4064         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4065         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4066     }
4067     if ( bExp == 0 ) {
4068         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4069         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4070     }
4071     zExp = aExp + bExp - 0x3FF;
4072     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4073     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4074     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4075     zSig0 |= ( zSig1 != 0 );
4076     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4077         zSig0 <<= 1;
4078         --zExp;
4079     }
4080     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4081
4082 }
4083
4084 /*----------------------------------------------------------------------------
4085 | Returns the result of dividing the double-precision floating-point value `a'
4086 | by the corresponding value `b'.  The operation is performed according to
4087 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4088 *----------------------------------------------------------------------------*/
4089
4090 float64 float64_div(float64 a, float64 b, float_status *status)
4091 {
4092     flag aSign, bSign, zSign;
4093     int aExp, bExp, zExp;
4094     uint64_t aSig, bSig, zSig;
4095     uint64_t rem0, rem1;
4096     uint64_t term0, term1;
4097     a = float64_squash_input_denormal(a, status);
4098     b = float64_squash_input_denormal(b, status);
4099
4100     aSig = extractFloat64Frac( a );
4101     aExp = extractFloat64Exp( a );
4102     aSign = extractFloat64Sign( a );
4103     bSig = extractFloat64Frac( b );
4104     bExp = extractFloat64Exp( b );
4105     bSign = extractFloat64Sign( b );
4106     zSign = aSign ^ bSign;
4107     if ( aExp == 0x7FF ) {
4108         if (aSig) {
4109             return propagateFloat64NaN(a, b, status);
4110         }
4111         if ( bExp == 0x7FF ) {
4112             if (bSig) {
4113                 return propagateFloat64NaN(a, b, status);
4114             }
4115             float_raise(float_flag_invalid, status);
4116             return float64_default_nan(status);
4117         }
4118         return packFloat64( zSign, 0x7FF, 0 );
4119     }
4120     if ( bExp == 0x7FF ) {
4121         if (bSig) {
4122             return propagateFloat64NaN(a, b, status);
4123         }
4124         return packFloat64( zSign, 0, 0 );
4125     }
4126     if ( bExp == 0 ) {
4127         if ( bSig == 0 ) {
4128             if ( ( aExp | aSig ) == 0 ) {
4129                 float_raise(float_flag_invalid, status);
4130                 return float64_default_nan(status);
4131             }
4132             float_raise(float_flag_divbyzero, status);
4133             return packFloat64( zSign, 0x7FF, 0 );
4134         }
4135         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4136     }
4137     if ( aExp == 0 ) {
4138         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4139         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4140     }
4141     zExp = aExp - bExp + 0x3FD;
4142     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4143     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4144     if ( bSig <= ( aSig + aSig ) ) {
4145         aSig >>= 1;
4146         ++zExp;
4147     }
4148     zSig = estimateDiv128To64( aSig, 0, bSig );
4149     if ( ( zSig & 0x1FF ) <= 2 ) {
4150         mul64To128( bSig, zSig, &term0, &term1 );
4151         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4152         while ( (int64_t) rem0 < 0 ) {
4153             --zSig;
4154             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4155         }
4156         zSig |= ( rem1 != 0 );
4157     }
4158     return roundAndPackFloat64(zSign, zExp, zSig, status);
4159
4160 }
4161
4162 /*----------------------------------------------------------------------------
4163 | Returns the remainder of the double-precision floating-point value `a'
4164 | with respect to the corresponding value `b'.  The operation is performed
4165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4166 *----------------------------------------------------------------------------*/
4167
4168 float64 float64_rem(float64 a, float64 b, float_status *status)
4169 {
4170     flag aSign, zSign;
4171     int aExp, bExp, expDiff;
4172     uint64_t aSig, bSig;
4173     uint64_t q, alternateASig;
4174     int64_t sigMean;
4175
4176     a = float64_squash_input_denormal(a, status);
4177     b = float64_squash_input_denormal(b, status);
4178     aSig = extractFloat64Frac( a );
4179     aExp = extractFloat64Exp( a );
4180     aSign = extractFloat64Sign( a );
4181     bSig = extractFloat64Frac( b );
4182     bExp = extractFloat64Exp( b );
4183     if ( aExp == 0x7FF ) {
4184         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4185             return propagateFloat64NaN(a, b, status);
4186         }
4187         float_raise(float_flag_invalid, status);
4188         return float64_default_nan(status);
4189     }
4190     if ( bExp == 0x7FF ) {
4191         if (bSig) {
4192             return propagateFloat64NaN(a, b, status);
4193         }
4194         return a;
4195     }
4196     if ( bExp == 0 ) {
4197         if ( bSig == 0 ) {
4198             float_raise(float_flag_invalid, status);
4199             return float64_default_nan(status);
4200         }
4201         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4202     }
4203     if ( aExp == 0 ) {
4204         if ( aSig == 0 ) return a;
4205         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4206     }
4207     expDiff = aExp - bExp;
4208     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4209     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4210     if ( expDiff < 0 ) {
4211         if ( expDiff < -1 ) return a;
4212         aSig >>= 1;
4213     }
4214     q = ( bSig <= aSig );
4215     if ( q ) aSig -= bSig;
4216     expDiff -= 64;
4217     while ( 0 < expDiff ) {
4218         q = estimateDiv128To64( aSig, 0, bSig );
4219         q = ( 2 < q ) ? q - 2 : 0;
4220         aSig = - ( ( bSig>>2 ) * q );
4221         expDiff -= 62;
4222     }
4223     expDiff += 64;
4224     if ( 0 < expDiff ) {
4225         q = estimateDiv128To64( aSig, 0, bSig );
4226         q = ( 2 < q ) ? q - 2 : 0;
4227         q >>= 64 - expDiff;
4228         bSig >>= 2;
4229         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4230     }
4231     else {
4232         aSig >>= 2;
4233         bSig >>= 2;
4234     }
4235     do {
4236         alternateASig = aSig;
4237         ++q;
4238         aSig -= bSig;
4239     } while ( 0 <= (int64_t) aSig );
4240     sigMean = aSig + alternateASig;
4241     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4242         aSig = alternateASig;
4243     }
4244     zSign = ( (int64_t) aSig < 0 );
4245     if ( zSign ) aSig = - aSig;
4246     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4247
4248 }
4249
4250 /*----------------------------------------------------------------------------
4251 | Returns the result of multiplying the double-precision floating-point values
4252 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4253 | multiplication.  The operation is performed according to the IEC/IEEE
4254 | Standard for Binary Floating-Point Arithmetic 754-2008.
4255 | The flags argument allows the caller to select negation of the
4256 | addend, the intermediate product, or the final result. (The difference
4257 | between this and having the caller do a separate negation is that negating
4258 | externally will flip the sign bit on NaNs.)
4259 *----------------------------------------------------------------------------*/
4260
4261 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4262                        float_status *status)
4263 {
4264     flag aSign, bSign, cSign, zSign;
4265     int aExp, bExp, cExp, pExp, zExp, expDiff;
4266     uint64_t aSig, bSig, cSig;
4267     flag pInf, pZero, pSign;
4268     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4269     int shiftcount;
4270     flag signflip, infzero;
4271
4272     a = float64_squash_input_denormal(a, status);
4273     b = float64_squash_input_denormal(b, status);
4274     c = float64_squash_input_denormal(c, status);
4275     aSig = extractFloat64Frac(a);
4276     aExp = extractFloat64Exp(a);
4277     aSign = extractFloat64Sign(a);
4278     bSig = extractFloat64Frac(b);
4279     bExp = extractFloat64Exp(b);
4280     bSign = extractFloat64Sign(b);
4281     cSig = extractFloat64Frac(c);
4282     cExp = extractFloat64Exp(c);
4283     cSign = extractFloat64Sign(c);
4284
4285     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4286                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4287
4288     /* It is implementation-defined whether the cases of (0,inf,qnan)
4289      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4290      * they return if they do), so we have to hand this information
4291      * off to the target-specific pick-a-NaN routine.
4292      */
4293     if (((aExp == 0x7ff) && aSig) ||
4294         ((bExp == 0x7ff) && bSig) ||
4295         ((cExp == 0x7ff) && cSig)) {
4296         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4297     }
4298
4299     if (infzero) {
4300         float_raise(float_flag_invalid, status);
4301         return float64_default_nan(status);
4302     }
4303
4304     if (flags & float_muladd_negate_c) {
4305         cSign ^= 1;
4306     }
4307
4308     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4309
4310     /* Work out the sign and type of the product */
4311     pSign = aSign ^ bSign;
4312     if (flags & float_muladd_negate_product) {
4313         pSign ^= 1;
4314     }
4315     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4316     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4317
4318     if (cExp == 0x7ff) {
4319         if (pInf && (pSign ^ cSign)) {
4320             /* addition of opposite-signed infinities => InvalidOperation */
4321             float_raise(float_flag_invalid, status);
4322             return float64_default_nan(status);
4323         }
4324         /* Otherwise generate an infinity of the same sign */
4325         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4326     }
4327
4328     if (pInf) {
4329         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4330     }
4331
4332     if (pZero) {
4333         if (cExp == 0) {
4334             if (cSig == 0) {
4335                 /* Adding two exact zeroes */
4336                 if (pSign == cSign) {
4337                     zSign = pSign;
4338                 } else if (status->float_rounding_mode == float_round_down) {
4339                     zSign = 1;
4340                 } else {
4341                     zSign = 0;
4342                 }
4343                 return packFloat64(zSign ^ signflip, 0, 0);
4344             }
4345             /* Exact zero plus a denorm */
4346             if (status->flush_to_zero) {
4347                 float_raise(float_flag_output_denormal, status);
4348                 return packFloat64(cSign ^ signflip, 0, 0);
4349             }
4350         }
4351         /* Zero plus something non-zero : just return the something */
4352         if (flags & float_muladd_halve_result) {
4353             if (cExp == 0) {
4354                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4355             }
4356             /* Subtract one to halve, and one again because roundAndPackFloat64
4357              * wants one less than the true exponent.
4358              */
4359             cExp -= 2;
4360             cSig = (cSig | 0x0010000000000000ULL) << 10;
4361             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4362         }
4363         return packFloat64(cSign ^ signflip, cExp, cSig);
4364     }
4365
4366     if (aExp == 0) {
4367         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4368     }
4369     if (bExp == 0) {
4370         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4371     }
4372
4373     /* Calculate the actual result a * b + c */
4374
4375     /* Multiply first; this is easy. */
4376     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4377      * because we want the true exponent, not the "one-less-than"
4378      * flavour that roundAndPackFloat64() takes.
4379      */
4380     pExp = aExp + bExp - 0x3fe;
4381     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4382     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4383     mul64To128(aSig, bSig, &pSig0, &pSig1);
4384     if ((int64_t)(pSig0 << 1) >= 0) {
4385         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4386         pExp--;
4387     }
4388
4389     zSign = pSign ^ signflip;
4390
4391     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4392      * bit in position 126.
4393      */
4394     if (cExp == 0) {
4395         if (!cSig) {
4396             /* Throw out the special case of c being an exact zero now */
4397             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4398             if (flags & float_muladd_halve_result) {
4399                 pExp--;
4400             }
4401             return roundAndPackFloat64(zSign, pExp - 1,
4402                                        pSig1, status);
4403         }
4404         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4405     }
4406
4407     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4408      * significand of the addend, with the explicit bit in position 126.
4409      */
4410     cSig0 = cSig << (126 - 64 - 52);
4411     cSig1 = 0;
4412     cSig0 |= LIT64(0x4000000000000000);
4413     expDiff = pExp - cExp;
4414
4415     if (pSign == cSign) {
4416         /* Addition */
4417         if (expDiff > 0) {
4418             /* scale c to match p */
4419             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4420             zExp = pExp;
4421         } else if (expDiff < 0) {
4422             /* scale p to match c */
4423             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4424             zExp = cExp;
4425         } else {
4426             /* no scaling needed */
4427             zExp = cExp;
4428         }
4429         /* Add significands and make sure explicit bit ends up in posn 126 */
4430         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4431         if ((int64_t)zSig0 < 0) {
4432             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4433         } else {
4434             zExp--;
4435         }
4436         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4437         if (flags & float_muladd_halve_result) {
4438             zExp--;
4439         }
4440         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4441     } else {
4442         /* Subtraction */
4443         if (expDiff > 0) {
4444             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4445             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4446             zExp = pExp;
4447         } else if (expDiff < 0) {
4448             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4449             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4450             zExp = cExp;
4451             zSign ^= 1;
4452         } else {
4453             zExp = pExp;
4454             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4455                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4456             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4457                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4458                 zSign ^= 1;
4459             } else {
4460                 /* Exact zero */
4461                 zSign = signflip;
4462                 if (status->float_rounding_mode == float_round_down) {
4463                     zSign ^= 1;
4464                 }
4465                 return packFloat64(zSign, 0, 0);
4466             }
4467         }
4468         --zExp;
4469         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4470          * starting with the significand in a pair of uint64_t.
4471          */
4472         if (zSig0) {
4473             shiftcount = countLeadingZeros64(zSig0) - 1;
4474             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4475             if (zSig1) {
4476                 zSig0 |= 1;
4477             }
4478             zExp -= shiftcount;
4479         } else {
4480             shiftcount = countLeadingZeros64(zSig1);
4481             if (shiftcount == 0) {
4482                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4483                 zExp -= 63;
4484             } else {
4485                 shiftcount--;
4486                 zSig0 = zSig1 << shiftcount;
4487                 zExp -= (shiftcount + 64);
4488             }
4489         }
4490         if (flags & float_muladd_halve_result) {
4491             zExp--;
4492         }
4493         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4494     }
4495 }
4496
4497 /*----------------------------------------------------------------------------
4498 | Returns the square root of the double-precision floating-point value `a'.
4499 | The operation is performed according to the IEC/IEEE Standard for Binary
4500 | Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4502
4503 float64 float64_sqrt(float64 a, float_status *status)
4504 {
4505     flag aSign;
4506     int aExp, zExp;
4507     uint64_t aSig, zSig, doubleZSig;
4508     uint64_t rem0, rem1, term0, term1;
4509     a = float64_squash_input_denormal(a, status);
4510
4511     aSig = extractFloat64Frac( a );
4512     aExp = extractFloat64Exp( a );
4513     aSign = extractFloat64Sign( a );
4514     if ( aExp == 0x7FF ) {
4515         if (aSig) {
4516             return propagateFloat64NaN(a, a, status);
4517         }
4518         if ( ! aSign ) return a;
4519         float_raise(float_flag_invalid, status);
4520         return float64_default_nan(status);
4521     }
4522     if ( aSign ) {
4523         if ( ( aExp | aSig ) == 0 ) return a;
4524         float_raise(float_flag_invalid, status);
4525         return float64_default_nan(status);
4526     }
4527     if ( aExp == 0 ) {
4528         if ( aSig == 0 ) return float64_zero;
4529         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4530     }
4531     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4532     aSig |= LIT64( 0x0010000000000000 );
4533     zSig = estimateSqrt32( aExp, aSig>>21 );
4534     aSig <<= 9 - ( aExp & 1 );
4535     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4536     if ( ( zSig & 0x1FF ) <= 5 ) {
4537         doubleZSig = zSig<<1;
4538         mul64To128( zSig, zSig, &term0, &term1 );
4539         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4540         while ( (int64_t) rem0 < 0 ) {
4541             --zSig;
4542             doubleZSig -= 2;
4543             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4544         }
4545         zSig |= ( ( rem0 | rem1 ) != 0 );
4546     }
4547     return roundAndPackFloat64(0, zExp, zSig, status);
4548
4549 }
4550
4551 /*----------------------------------------------------------------------------
4552 | Returns the binary log of the double-precision floating-point value `a'.
4553 | The operation is performed according to the IEC/IEEE Standard for Binary
4554 | Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/
4556 float64 float64_log2(float64 a, float_status *status)
4557 {
4558     flag aSign, zSign;
4559     int aExp;
4560     uint64_t aSig, aSig0, aSig1, zSig, i;
4561     a = float64_squash_input_denormal(a, status);
4562
4563     aSig = extractFloat64Frac( a );
4564     aExp = extractFloat64Exp( a );
4565     aSign = extractFloat64Sign( a );
4566
4567     if ( aExp == 0 ) {
4568         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4569         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4570     }
4571     if ( aSign ) {
4572         float_raise(float_flag_invalid, status);
4573         return float64_default_nan(status);
4574     }
4575     if ( aExp == 0x7FF ) {
4576         if (aSig) {
4577             return propagateFloat64NaN(a, float64_zero, status);
4578         }
4579         return a;
4580     }
4581
4582     aExp -= 0x3FF;
4583     aSig |= LIT64( 0x0010000000000000 );
4584     zSign = aExp < 0;
4585     zSig = (uint64_t)aExp << 52;
4586     for (i = 1LL << 51; i > 0; i >>= 1) {
4587         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4588         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4589         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4590             aSig >>= 1;
4591             zSig |= i;
4592         }
4593     }
4594
4595     if ( zSign )
4596         zSig = -zSig;
4597     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4598 }
4599
4600 /*----------------------------------------------------------------------------
4601 | Returns 1 if the double-precision floating-point value `a' is equal to the
4602 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4603 | if either operand is a NaN.  Otherwise, the comparison is performed
4604 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4605 *----------------------------------------------------------------------------*/
4606
4607 int float64_eq(float64 a, float64 b, float_status *status)
4608 {
4609     uint64_t av, bv;
4610     a = float64_squash_input_denormal(a, status);
4611     b = float64_squash_input_denormal(b, status);
4612
4613     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4614          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4615        ) {
4616         float_raise(float_flag_invalid, status);
4617         return 0;
4618     }
4619     av = float64_val(a);
4620     bv = float64_val(b);
4621     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4622
4623 }
4624
4625 /*----------------------------------------------------------------------------
4626 | Returns 1 if the double-precision floating-point value `a' is less than or
4627 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4628 | exception is raised if either operand is a NaN.  The comparison is performed
4629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631
4632 int float64_le(float64 a, float64 b, float_status *status)
4633 {
4634     flag aSign, bSign;
4635     uint64_t av, bv;
4636     a = float64_squash_input_denormal(a, status);
4637     b = float64_squash_input_denormal(b, status);
4638
4639     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4640          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4641        ) {
4642         float_raise(float_flag_invalid, status);
4643         return 0;
4644     }
4645     aSign = extractFloat64Sign( a );
4646     bSign = extractFloat64Sign( b );
4647     av = float64_val(a);
4648     bv = float64_val(b);
4649     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4650     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4651
4652 }
4653
4654 /*----------------------------------------------------------------------------
4655 | Returns 1 if the double-precision floating-point value `a' is less than
4656 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4657 | raised if either operand is a NaN.  The comparison is performed according
4658 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4660
4661 int float64_lt(float64 a, float64 b, float_status *status)
4662 {
4663     flag aSign, bSign;
4664     uint64_t av, bv;
4665
4666     a = float64_squash_input_denormal(a, status);
4667     b = float64_squash_input_denormal(b, status);
4668     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4669          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4670        ) {
4671         float_raise(float_flag_invalid, status);
4672         return 0;
4673     }
4674     aSign = extractFloat64Sign( a );
4675     bSign = extractFloat64Sign( b );
4676     av = float64_val(a);
4677     bv = float64_val(b);
4678     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4679     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4680
4681 }
4682
4683 /*----------------------------------------------------------------------------
4684 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4685 | be compared, and 0 otherwise.  The invalid exception is raised if either
4686 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4687 | Standard for Binary Floating-Point Arithmetic.
4688 *----------------------------------------------------------------------------*/
4689
4690 int float64_unordered(float64 a, float64 b, float_status *status)
4691 {
4692     a = float64_squash_input_denormal(a, status);
4693     b = float64_squash_input_denormal(b, status);
4694
4695     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4696          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4697        ) {
4698         float_raise(float_flag_invalid, status);
4699         return 1;
4700     }
4701     return 0;
4702 }
4703
4704 /*----------------------------------------------------------------------------
4705 | Returns 1 if the double-precision floating-point value `a' is equal to the
4706 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4707 | exception.The comparison is performed according to the IEC/IEEE Standard
4708 | for Binary Floating-Point Arithmetic.
4709 *----------------------------------------------------------------------------*/
4710
4711 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4712 {
4713     uint64_t av, bv;
4714     a = float64_squash_input_denormal(a, status);
4715     b = float64_squash_input_denormal(b, status);
4716
4717     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4718          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4719        ) {
4720         if (float64_is_signaling_nan(a, status)
4721          || float64_is_signaling_nan(b, status)) {
4722             float_raise(float_flag_invalid, status);
4723         }
4724         return 0;
4725     }
4726     av = float64_val(a);
4727     bv = float64_val(b);
4728     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4729
4730 }
4731
4732 /*----------------------------------------------------------------------------
4733 | Returns 1 if the double-precision floating-point value `a' is less than or
4734 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4735 | cause an exception.  Otherwise, the comparison is performed according to the
4736 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737 *----------------------------------------------------------------------------*/
4738
4739 int float64_le_quiet(float64 a, float64 b, float_status *status)
4740 {
4741     flag aSign, bSign;
4742     uint64_t av, bv;
4743     a = float64_squash_input_denormal(a, status);
4744     b = float64_squash_input_denormal(b, status);
4745
4746     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4747          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4748        ) {
4749         if (float64_is_signaling_nan(a, status)
4750          || float64_is_signaling_nan(b, status)) {
4751             float_raise(float_flag_invalid, status);
4752         }
4753         return 0;
4754     }
4755     aSign = extractFloat64Sign( a );
4756     bSign = extractFloat64Sign( b );
4757     av = float64_val(a);
4758     bv = float64_val(b);
4759     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4760     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4761
4762 }
4763
4764 /*----------------------------------------------------------------------------
4765 | Returns 1 if the double-precision floating-point value `a' is less than
4766 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4767 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4768 | Standard for Binary Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770
4771 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4772 {
4773     flag aSign, bSign;
4774     uint64_t av, bv;
4775     a = float64_squash_input_denormal(a, status);
4776     b = float64_squash_input_denormal(b, status);
4777
4778     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4779          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4780        ) {
4781         if (float64_is_signaling_nan(a, status)
4782          || float64_is_signaling_nan(b, status)) {
4783             float_raise(float_flag_invalid, status);
4784         }
4785         return 0;
4786     }
4787     aSign = extractFloat64Sign( a );
4788     bSign = extractFloat64Sign( b );
4789     av = float64_val(a);
4790     bv = float64_val(b);
4791     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4792     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4793
4794 }
4795
4796 /*----------------------------------------------------------------------------
4797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4798 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4799 | comparison is performed according to the IEC/IEEE Standard for Binary
4800 | Floating-Point Arithmetic.
4801 *----------------------------------------------------------------------------*/
4802
4803 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4804 {
4805     a = float64_squash_input_denormal(a, status);
4806     b = float64_squash_input_denormal(b, status);
4807
4808     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4809          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4810        ) {
4811         if (float64_is_signaling_nan(a, status)
4812          || float64_is_signaling_nan(b, status)) {
4813             float_raise(float_flag_invalid, status);
4814         }
4815         return 1;
4816     }
4817     return 0;
4818 }
4819
4820 /*----------------------------------------------------------------------------
4821 | Returns the result of converting the extended double-precision floating-
4822 | point value `a' to the 32-bit two's complement integer format.  The
4823 | conversion is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic---which means in particular that the conversion
4825 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4826 | largest positive integer is returned.  Otherwise, if the conversion
4827 | overflows, the largest integer with the same sign as `a' is returned.
4828 *----------------------------------------------------------------------------*/
4829
4830 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4831 {
4832     flag aSign;
4833     int32_t aExp, shiftCount;
4834     uint64_t aSig;
4835
4836     if (floatx80_invalid_encoding(a)) {
4837         float_raise(float_flag_invalid, status);
4838         return 1 << 31;
4839     }
4840     aSig = extractFloatx80Frac( a );
4841     aExp = extractFloatx80Exp( a );
4842     aSign = extractFloatx80Sign( a );
4843     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4844     shiftCount = 0x4037 - aExp;
4845     if ( shiftCount <= 0 ) shiftCount = 1;
4846     shift64RightJamming( aSig, shiftCount, &aSig );
4847     return roundAndPackInt32(aSign, aSig, status);
4848
4849 }
4850
4851 /*----------------------------------------------------------------------------
4852 | Returns the result of converting the extended double-precision floating-
4853 | point value `a' to the 32-bit two's complement integer format.  The
4854 | conversion is performed according to the IEC/IEEE Standard for Binary
4855 | Floating-Point Arithmetic, except that the conversion is always rounded
4856 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4857 | Otherwise, if the conversion overflows, the largest integer with the same
4858 | sign as `a' is returned.
4859 *----------------------------------------------------------------------------*/
4860
4861 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4862 {
4863     flag aSign;
4864     int32_t aExp, shiftCount;
4865     uint64_t aSig, savedASig;
4866     int32_t z;
4867
4868     if (floatx80_invalid_encoding(a)) {
4869         float_raise(float_flag_invalid, status);
4870         return 1 << 31;
4871     }
4872     aSig = extractFloatx80Frac( a );
4873     aExp = extractFloatx80Exp( a );
4874     aSign = extractFloatx80Sign( a );
4875     if ( 0x401E < aExp ) {
4876         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4877         goto invalid;
4878     }
4879     else if ( aExp < 0x3FFF ) {
4880         if (aExp || aSig) {
4881             status->float_exception_flags |= float_flag_inexact;
4882         }
4883         return 0;
4884     }
4885     shiftCount = 0x403E - aExp;
4886     savedASig = aSig;
4887     aSig >>= shiftCount;
4888     z = aSig;
4889     if ( aSign ) z = - z;
4890     if ( ( z < 0 ) ^ aSign ) {
4891  invalid:
4892         float_raise(float_flag_invalid, status);
4893         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4894     }
4895     if ( ( aSig<<shiftCount ) != savedASig ) {
4896         status->float_exception_flags |= float_flag_inexact;
4897     }
4898     return z;
4899
4900 }
4901
4902 /*----------------------------------------------------------------------------
4903 | Returns the result of converting the extended double-precision floating-
4904 | point value `a' to the 64-bit two's complement integer format.  The
4905 | conversion is performed according to the IEC/IEEE Standard for Binary
4906 | Floating-Point Arithmetic---which means in particular that the conversion
4907 | is rounded according to the current rounding mode.  If `a' is a NaN,
4908 | the largest positive integer is returned.  Otherwise, if the conversion
4909 | overflows, the largest integer with the same sign as `a' is returned.
4910 *----------------------------------------------------------------------------*/
4911
4912 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4913 {
4914     flag aSign;
4915     int32_t aExp, shiftCount;
4916     uint64_t aSig, aSigExtra;
4917
4918     if (floatx80_invalid_encoding(a)) {
4919         float_raise(float_flag_invalid, status);
4920         return 1ULL << 63;
4921     }
4922     aSig = extractFloatx80Frac( a );
4923     aExp = extractFloatx80Exp( a );
4924     aSign = extractFloatx80Sign( a );
4925     shiftCount = 0x403E - aExp;
4926     if ( shiftCount <= 0 ) {
4927         if ( shiftCount ) {
4928             float_raise(float_flag_invalid, status);
4929             if (    ! aSign
4930                  || (    ( aExp == 0x7FFF )
4931                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4932                ) {
4933                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4934             }
4935             return (int64_t) LIT64( 0x8000000000000000 );
4936         }
4937         aSigExtra = 0;
4938     }
4939     else {
4940         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4941     }
4942     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4943
4944 }
4945
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of converting the extended double-precision floating-
4948 | point value `a' to the 64-bit two's complement integer format.  The
4949 | conversion is performed according to the IEC/IEEE Standard for Binary
4950 | Floating-Point Arithmetic, except that the conversion is always rounded
4951 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4952 | Otherwise, if the conversion overflows, the largest integer with the same
4953 | sign as `a' is returned.
4954 *----------------------------------------------------------------------------*/
4955
4956 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4957 {
4958     flag aSign;
4959     int32_t aExp, shiftCount;
4960     uint64_t aSig;
4961     int64_t z;
4962
4963     if (floatx80_invalid_encoding(a)) {
4964         float_raise(float_flag_invalid, status);
4965         return 1ULL << 63;
4966     }
4967     aSig = extractFloatx80Frac( a );
4968     aExp = extractFloatx80Exp( a );
4969     aSign = extractFloatx80Sign( a );
4970     shiftCount = aExp - 0x403E;
4971     if ( 0 <= shiftCount ) {
4972         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4973         if ( ( a.high != 0xC03E ) || aSig ) {
4974             float_raise(float_flag_invalid, status);
4975             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4976                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4977             }
4978         }
4979         return (int64_t) LIT64( 0x8000000000000000 );
4980     }
4981     else if ( aExp < 0x3FFF ) {
4982         if (aExp | aSig) {
4983             status->float_exception_flags |= float_flag_inexact;
4984         }
4985         return 0;
4986     }
4987     z = aSig>>( - shiftCount );
4988     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4989         status->float_exception_flags |= float_flag_inexact;
4990     }
4991     if ( aSign ) z = - z;
4992     return z;
4993
4994 }
4995
4996 /*----------------------------------------------------------------------------
4997 | Returns the result of converting the extended double-precision floating-
4998 | point value `a' to the single-precision floating-point format.  The
4999 | conversion is performed according to the IEC/IEEE Standard for Binary
5000 | Floating-Point Arithmetic.
5001 *----------------------------------------------------------------------------*/
5002
5003 float32 floatx80_to_float32(floatx80 a, float_status *status)
5004 {
5005     flag aSign;
5006     int32_t aExp;
5007     uint64_t aSig;
5008
5009     if (floatx80_invalid_encoding(a)) {
5010         float_raise(float_flag_invalid, status);
5011         return float32_default_nan(status);
5012     }
5013     aSig = extractFloatx80Frac( a );
5014     aExp = extractFloatx80Exp( a );
5015     aSign = extractFloatx80Sign( a );
5016     if ( aExp == 0x7FFF ) {
5017         if ( (uint64_t) ( aSig<<1 ) ) {
5018             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5019         }
5020         return packFloat32( aSign, 0xFF, 0 );
5021     }
5022     shift64RightJamming( aSig, 33, &aSig );
5023     if ( aExp || aSig ) aExp -= 0x3F81;
5024     return roundAndPackFloat32(aSign, aExp, aSig, status);
5025
5026 }
5027
5028 /*----------------------------------------------------------------------------
5029 | Returns the result of converting the extended double-precision floating-
5030 | point value `a' to the double-precision floating-point format.  The
5031 | conversion is performed according to the IEC/IEEE Standard for Binary
5032 | Floating-Point Arithmetic.
5033 *----------------------------------------------------------------------------*/
5034
5035 float64 floatx80_to_float64(floatx80 a, float_status *status)
5036 {
5037     flag aSign;
5038     int32_t aExp;
5039     uint64_t aSig, zSig;
5040
5041     if (floatx80_invalid_encoding(a)) {
5042         float_raise(float_flag_invalid, status);
5043         return float64_default_nan(status);
5044     }
5045     aSig = extractFloatx80Frac( a );
5046     aExp = extractFloatx80Exp( a );
5047     aSign = extractFloatx80Sign( a );
5048     if ( aExp == 0x7FFF ) {
5049         if ( (uint64_t) ( aSig<<1 ) ) {
5050             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5051         }
5052         return packFloat64( aSign, 0x7FF, 0 );
5053     }
5054     shift64RightJamming( aSig, 1, &zSig );
5055     if ( aExp || aSig ) aExp -= 0x3C01;
5056     return roundAndPackFloat64(aSign, aExp, zSig, status);
5057
5058 }
5059
5060 /*----------------------------------------------------------------------------
5061 | Returns the result of converting the extended double-precision floating-
5062 | point value `a' to the quadruple-precision floating-point format.  The
5063 | conversion is performed according to the IEC/IEEE Standard for Binary
5064 | Floating-Point Arithmetic.
5065 *----------------------------------------------------------------------------*/
5066
5067 float128 floatx80_to_float128(floatx80 a, float_status *status)
5068 {
5069     flag aSign;
5070     int aExp;
5071     uint64_t aSig, zSig0, zSig1;
5072
5073     if (floatx80_invalid_encoding(a)) {
5074         float_raise(float_flag_invalid, status);
5075         return float128_default_nan(status);
5076     }
5077     aSig = extractFloatx80Frac( a );
5078     aExp = extractFloatx80Exp( a );
5079     aSign = extractFloatx80Sign( a );
5080     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5081         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5082     }
5083     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5084     return packFloat128( aSign, aExp, zSig0, zSig1 );
5085
5086 }
5087
5088 /*----------------------------------------------------------------------------
5089 | Rounds the extended double-precision floating-point value `a' to an integer,
5090 | and returns the result as an extended quadruple-precision floating-point
5091 | value.  The operation is performed according to the IEC/IEEE Standard for
5092 | Binary Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094
5095 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5096 {
5097     flag aSign;
5098     int32_t aExp;
5099     uint64_t lastBitMask, roundBitsMask;
5100     floatx80 z;
5101
5102     if (floatx80_invalid_encoding(a)) {
5103         float_raise(float_flag_invalid, status);
5104         return floatx80_default_nan(status);
5105     }
5106     aExp = extractFloatx80Exp( a );
5107     if ( 0x403E <= aExp ) {
5108         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5109             return propagateFloatx80NaN(a, a, status);
5110         }
5111         return a;
5112     }
5113     if ( aExp < 0x3FFF ) {
5114         if (    ( aExp == 0 )
5115              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5116             return a;
5117         }
5118         status->float_exception_flags |= float_flag_inexact;
5119         aSign = extractFloatx80Sign( a );
5120         switch (status->float_rounding_mode) {
5121          case float_round_nearest_even:
5122             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5123                ) {
5124                 return
5125                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5126             }
5127             break;
5128         case float_round_ties_away:
5129             if (aExp == 0x3FFE) {
5130                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5131             }
5132             break;
5133          case float_round_down:
5134             return
5135                   aSign ?
5136                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5137                 : packFloatx80( 0, 0, 0 );
5138          case float_round_up:
5139             return
5140                   aSign ? packFloatx80( 1, 0, 0 )
5141                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5142         }
5143         return packFloatx80( aSign, 0, 0 );
5144     }
5145     lastBitMask = 1;
5146     lastBitMask <<= 0x403E - aExp;
5147     roundBitsMask = lastBitMask - 1;
5148     z = a;
5149     switch (status->float_rounding_mode) {
5150     case float_round_nearest_even:
5151         z.low += lastBitMask>>1;
5152         if ((z.low & roundBitsMask) == 0) {
5153             z.low &= ~lastBitMask;
5154         }
5155         break;
5156     case float_round_ties_away:
5157         z.low += lastBitMask >> 1;
5158         break;
5159     case float_round_to_zero:
5160         break;
5161     case float_round_up:
5162         if (!extractFloatx80Sign(z)) {
5163             z.low += roundBitsMask;
5164         }
5165         break;
5166     case float_round_down:
5167         if (extractFloatx80Sign(z)) {
5168             z.low += roundBitsMask;
5169         }
5170         break;
5171     default:
5172         abort();
5173     }
5174     z.low &= ~ roundBitsMask;
5175     if ( z.low == 0 ) {
5176         ++z.high;
5177         z.low = LIT64( 0x8000000000000000 );
5178     }
5179     if (z.low != a.low) {
5180         status->float_exception_flags |= float_flag_inexact;
5181     }
5182     return z;
5183
5184 }
5185
5186 /*----------------------------------------------------------------------------
5187 | Returns the result of adding the absolute values of the extended double-
5188 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5189 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5190 | The addition is performed according to the IEC/IEEE Standard for Binary
5191 | Floating-Point Arithmetic.
5192 *----------------------------------------------------------------------------*/
5193
5194 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5195                                 float_status *status)
5196 {
5197     int32_t aExp, bExp, zExp;
5198     uint64_t aSig, bSig, zSig0, zSig1;
5199     int32_t expDiff;
5200
5201     aSig = extractFloatx80Frac( a );
5202     aExp = extractFloatx80Exp( a );
5203     bSig = extractFloatx80Frac( b );
5204     bExp = extractFloatx80Exp( b );
5205     expDiff = aExp - bExp;
5206     if ( 0 < expDiff ) {
5207         if ( aExp == 0x7FFF ) {
5208             if ((uint64_t)(aSig << 1)) {
5209                 return propagateFloatx80NaN(a, b, status);
5210             }
5211             return a;
5212         }
5213         if ( bExp == 0 ) --expDiff;
5214         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5215         zExp = aExp;
5216     }
5217     else if ( expDiff < 0 ) {
5218         if ( bExp == 0x7FFF ) {
5219             if ((uint64_t)(bSig << 1)) {
5220                 return propagateFloatx80NaN(a, b, status);
5221             }
5222             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5223         }
5224         if ( aExp == 0 ) ++expDiff;
5225         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5226         zExp = bExp;
5227     }
5228     else {
5229         if ( aExp == 0x7FFF ) {
5230             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5231                 return propagateFloatx80NaN(a, b, status);
5232             }
5233             return a;
5234         }
5235         zSig1 = 0;
5236         zSig0 = aSig + bSig;
5237         if ( aExp == 0 ) {
5238             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5239             goto roundAndPack;
5240         }
5241         zExp = aExp;
5242         goto shiftRight1;
5243     }
5244     zSig0 = aSig + bSig;
5245     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5246  shiftRight1:
5247     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5248     zSig0 |= LIT64( 0x8000000000000000 );
5249     ++zExp;
5250  roundAndPack:
5251     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5252                                 zSign, zExp, zSig0, zSig1, status);
5253 }
5254
5255 /*----------------------------------------------------------------------------
5256 | Returns the result of subtracting the absolute values of the extended
5257 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5258 | difference is negated before being returned.  `zSign' is ignored if the
5259 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5260 | Standard for Binary Floating-Point Arithmetic.
5261 *----------------------------------------------------------------------------*/
5262
5263 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5264                                 float_status *status)
5265 {
5266     int32_t aExp, bExp, zExp;
5267     uint64_t aSig, bSig, zSig0, zSig1;
5268     int32_t expDiff;
5269
5270     aSig = extractFloatx80Frac( a );
5271     aExp = extractFloatx80Exp( a );
5272     bSig = extractFloatx80Frac( b );
5273     bExp = extractFloatx80Exp( b );
5274     expDiff = aExp - bExp;
5275     if ( 0 < expDiff ) goto aExpBigger;
5276     if ( expDiff < 0 ) goto bExpBigger;
5277     if ( aExp == 0x7FFF ) {
5278         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5279             return propagateFloatx80NaN(a, b, status);
5280         }
5281         float_raise(float_flag_invalid, status);
5282         return floatx80_default_nan(status);
5283     }
5284     if ( aExp == 0 ) {
5285         aExp = 1;
5286         bExp = 1;
5287     }
5288     zSig1 = 0;
5289     if ( bSig < aSig ) goto aBigger;
5290     if ( aSig < bSig ) goto bBigger;
5291     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5292  bExpBigger:
5293     if ( bExp == 0x7FFF ) {
5294         if ((uint64_t)(bSig << 1)) {
5295             return propagateFloatx80NaN(a, b, status);
5296         }
5297         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5298     }
5299     if ( aExp == 0 ) ++expDiff;
5300     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5301  bBigger:
5302     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5303     zExp = bExp;
5304     zSign ^= 1;
5305     goto normalizeRoundAndPack;
5306  aExpBigger:
5307     if ( aExp == 0x7FFF ) {
5308         if ((uint64_t)(aSig << 1)) {
5309             return propagateFloatx80NaN(a, b, status);
5310         }
5311         return a;
5312     }
5313     if ( bExp == 0 ) --expDiff;
5314     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5315  aBigger:
5316     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5317     zExp = aExp;
5318  normalizeRoundAndPack:
5319     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5320                                          zSign, zExp, zSig0, zSig1, status);
5321 }
5322
5323 /*----------------------------------------------------------------------------
5324 | Returns the result of adding the extended double-precision floating-point
5325 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5326 | Standard for Binary Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5328
5329 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5330 {
5331     flag aSign, bSign;
5332
5333     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5334         float_raise(float_flag_invalid, status);
5335         return floatx80_default_nan(status);
5336     }
5337     aSign = extractFloatx80Sign( a );
5338     bSign = extractFloatx80Sign( b );
5339     if ( aSign == bSign ) {
5340         return addFloatx80Sigs(a, b, aSign, status);
5341     }
5342     else {
5343         return subFloatx80Sigs(a, b, aSign, status);
5344     }
5345
5346 }
5347
5348 /*----------------------------------------------------------------------------
5349 | Returns the result of subtracting the extended double-precision floating-
5350 | point values `a' and `b'.  The operation is performed according to the
5351 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5352 *----------------------------------------------------------------------------*/
5353
5354 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5355 {
5356     flag aSign, bSign;
5357
5358     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5359         float_raise(float_flag_invalid, status);
5360         return floatx80_default_nan(status);
5361     }
5362     aSign = extractFloatx80Sign( a );
5363     bSign = extractFloatx80Sign( b );
5364     if ( aSign == bSign ) {
5365         return subFloatx80Sigs(a, b, aSign, status);
5366     }
5367     else {
5368         return addFloatx80Sigs(a, b, aSign, status);
5369     }
5370
5371 }
5372
5373 /*----------------------------------------------------------------------------
5374 | Returns the result of multiplying the extended double-precision floating-
5375 | point values `a' and `b'.  The operation is performed according to the
5376 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5377 *----------------------------------------------------------------------------*/
5378
5379 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5380 {
5381     flag aSign, bSign, zSign;
5382     int32_t aExp, bExp, zExp;
5383     uint64_t aSig, bSig, zSig0, zSig1;
5384
5385     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5386         float_raise(float_flag_invalid, status);
5387         return floatx80_default_nan(status);
5388     }
5389     aSig = extractFloatx80Frac( a );
5390     aExp = extractFloatx80Exp( a );
5391     aSign = extractFloatx80Sign( a );
5392     bSig = extractFloatx80Frac( b );
5393     bExp = extractFloatx80Exp( b );
5394     bSign = extractFloatx80Sign( b );
5395     zSign = aSign ^ bSign;
5396     if ( aExp == 0x7FFF ) {
5397         if (    (uint64_t) ( aSig<<1 )
5398              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5399             return propagateFloatx80NaN(a, b, status);
5400         }
5401         if ( ( bExp | bSig ) == 0 ) goto invalid;
5402         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5403     }
5404     if ( bExp == 0x7FFF ) {
5405         if ((uint64_t)(bSig << 1)) {
5406             return propagateFloatx80NaN(a, b, status);
5407         }
5408         if ( ( aExp | aSig ) == 0 ) {
5409  invalid:
5410             float_raise(float_flag_invalid, status);
5411             return floatx80_default_nan(status);
5412         }
5413         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5414     }
5415     if ( aExp == 0 ) {
5416         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5417         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5418     }
5419     if ( bExp == 0 ) {
5420         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5421         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5422     }
5423     zExp = aExp + bExp - 0x3FFE;
5424     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5425     if ( 0 < (int64_t) zSig0 ) {
5426         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5427         --zExp;
5428     }
5429     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5430                                 zSign, zExp, zSig0, zSig1, status);
5431 }
5432
5433 /*----------------------------------------------------------------------------
5434 | Returns the result of dividing the extended double-precision floating-point
5435 | value `a' by the corresponding value `b'.  The operation is performed
5436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5437 *----------------------------------------------------------------------------*/
5438
5439 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5440 {
5441     flag aSign, bSign, zSign;
5442     int32_t aExp, bExp, zExp;
5443     uint64_t aSig, bSig, zSig0, zSig1;
5444     uint64_t rem0, rem1, rem2, term0, term1, term2;
5445
5446     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5447         float_raise(float_flag_invalid, status);
5448         return floatx80_default_nan(status);
5449     }
5450     aSig = extractFloatx80Frac( a );
5451     aExp = extractFloatx80Exp( a );
5452     aSign = extractFloatx80Sign( a );
5453     bSig = extractFloatx80Frac( b );
5454     bExp = extractFloatx80Exp( b );
5455     bSign = extractFloatx80Sign( b );
5456     zSign = aSign ^ bSign;
5457     if ( aExp == 0x7FFF ) {
5458         if ((uint64_t)(aSig << 1)) {
5459             return propagateFloatx80NaN(a, b, status);
5460         }
5461         if ( bExp == 0x7FFF ) {
5462             if ((uint64_t)(bSig << 1)) {
5463                 return propagateFloatx80NaN(a, b, status);
5464             }
5465             goto invalid;
5466         }
5467         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5468     }
5469     if ( bExp == 0x7FFF ) {
5470         if ((uint64_t)(bSig << 1)) {
5471             return propagateFloatx80NaN(a, b, status);
5472         }
5473         return packFloatx80( zSign, 0, 0 );
5474     }
5475     if ( bExp == 0 ) {
5476         if ( bSig == 0 ) {
5477             if ( ( aExp | aSig ) == 0 ) {
5478  invalid:
5479                 float_raise(float_flag_invalid, status);
5480                 return floatx80_default_nan(status);
5481             }
5482             float_raise(float_flag_divbyzero, status);
5483             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5484         }
5485         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5486     }
5487     if ( aExp == 0 ) {
5488         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5489         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5490     }
5491     zExp = aExp - bExp + 0x3FFE;
5492     rem1 = 0;
5493     if ( bSig <= aSig ) {
5494         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5495         ++zExp;
5496     }
5497     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5498     mul64To128( bSig, zSig0, &term0, &term1 );
5499     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5500     while ( (int64_t) rem0 < 0 ) {
5501         --zSig0;
5502         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5503     }
5504     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5505     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5506         mul64To128( bSig, zSig1, &term1, &term2 );
5507         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5508         while ( (int64_t) rem1 < 0 ) {
5509             --zSig1;
5510             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5511         }
5512         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5513     }
5514     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5515                                 zSign, zExp, zSig0, zSig1, status);
5516 }
5517
5518 /*----------------------------------------------------------------------------
5519 | Returns the remainder of the extended double-precision floating-point value
5520 | `a' with respect to the corresponding value `b'.  The operation is performed
5521 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5522 *----------------------------------------------------------------------------*/
5523
5524 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5525 {
5526     flag aSign, zSign;
5527     int32_t aExp, bExp, expDiff;
5528     uint64_t aSig0, aSig1, bSig;
5529     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5530
5531     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5532         float_raise(float_flag_invalid, status);
5533         return floatx80_default_nan(status);
5534     }
5535     aSig0 = extractFloatx80Frac( a );
5536     aExp = extractFloatx80Exp( a );
5537     aSign = extractFloatx80Sign( a );
5538     bSig = extractFloatx80Frac( b );
5539     bExp = extractFloatx80Exp( b );
5540     if ( aExp == 0x7FFF ) {
5541         if (    (uint64_t) ( aSig0<<1 )
5542              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5543             return propagateFloatx80NaN(a, b, status);
5544         }
5545         goto invalid;
5546     }
5547     if ( bExp == 0x7FFF ) {
5548         if ((uint64_t)(bSig << 1)) {
5549             return propagateFloatx80NaN(a, b, status);
5550         }
5551         return a;
5552     }
5553     if ( bExp == 0 ) {
5554         if ( bSig == 0 ) {
5555  invalid:
5556             float_raise(float_flag_invalid, status);
5557             return floatx80_default_nan(status);
5558         }
5559         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5560     }
5561     if ( aExp == 0 ) {
5562         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5563         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5564     }
5565     bSig |= LIT64( 0x8000000000000000 );
5566     zSign = aSign;
5567     expDiff = aExp - bExp;
5568     aSig1 = 0;
5569     if ( expDiff < 0 ) {
5570         if ( expDiff < -1 ) return a;
5571         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5572         expDiff = 0;
5573     }
5574     q = ( bSig <= aSig0 );
5575     if ( q ) aSig0 -= bSig;
5576     expDiff -= 64;
5577     while ( 0 < expDiff ) {
5578         q = estimateDiv128To64( aSig0, aSig1, bSig );
5579         q = ( 2 < q ) ? q - 2 : 0;
5580         mul64To128( bSig, q, &term0, &term1 );
5581         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5582         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5583         expDiff -= 62;
5584     }
5585     expDiff += 64;
5586     if ( 0 < expDiff ) {
5587         q = estimateDiv128To64( aSig0, aSig1, bSig );
5588         q = ( 2 < q ) ? q - 2 : 0;
5589         q >>= 64 - expDiff;
5590         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5591         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5592         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5593         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5594             ++q;
5595             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5596         }
5597     }
5598     else {
5599         term1 = 0;
5600         term0 = bSig;
5601     }
5602     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5603     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5604          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5605               && ( q & 1 ) )
5606        ) {
5607         aSig0 = alternateASig0;
5608         aSig1 = alternateASig1;
5609         zSign = ! zSign;
5610     }
5611     return
5612         normalizeRoundAndPackFloatx80(
5613             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5614
5615 }
5616
5617 /*----------------------------------------------------------------------------
5618 | Returns the square root of the extended double-precision floating-point
5619 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5620 | for Binary Floating-Point Arithmetic.
5621 *----------------------------------------------------------------------------*/
5622
5623 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5624 {
5625     flag aSign;
5626     int32_t aExp, zExp;
5627     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5628     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5629
5630     if (floatx80_invalid_encoding(a)) {
5631         float_raise(float_flag_invalid, status);
5632         return floatx80_default_nan(status);
5633     }
5634     aSig0 = extractFloatx80Frac( a );
5635     aExp = extractFloatx80Exp( a );
5636     aSign = extractFloatx80Sign( a );
5637     if ( aExp == 0x7FFF ) {
5638         if ((uint64_t)(aSig0 << 1)) {
5639             return propagateFloatx80NaN(a, a, status);
5640         }
5641         if ( ! aSign ) return a;
5642         goto invalid;
5643     }
5644     if ( aSign ) {
5645         if ( ( aExp | aSig0 ) == 0 ) return a;
5646  invalid:
5647         float_raise(float_flag_invalid, status);
5648         return floatx80_default_nan(status);
5649     }
5650     if ( aExp == 0 ) {
5651         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5652         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5653     }
5654     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5655     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5656     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5657     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5658     doubleZSig0 = zSig0<<1;
5659     mul64To128( zSig0, zSig0, &term0, &term1 );
5660     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5661     while ( (int64_t) rem0 < 0 ) {
5662         --zSig0;
5663         doubleZSig0 -= 2;
5664         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5665     }
5666     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5667     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5668         if ( zSig1 == 0 ) zSig1 = 1;
5669         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5670         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5671         mul64To128( zSig1, zSig1, &term2, &term3 );
5672         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5673         while ( (int64_t) rem1 < 0 ) {
5674             --zSig1;
5675             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5676             term3 |= 1;
5677             term2 |= doubleZSig0;
5678             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5679         }
5680         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5681     }
5682     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5683     zSig0 |= doubleZSig0;
5684     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5685                                 0, zExp, zSig0, zSig1, status);
5686 }
5687
5688 /*----------------------------------------------------------------------------
5689 | Returns 1 if the extended double-precision floating-point value `a' is equal
5690 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5691 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5692 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5693 *----------------------------------------------------------------------------*/
5694
5695 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5696 {
5697
5698     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5699         || (extractFloatx80Exp(a) == 0x7FFF
5700             && (uint64_t) (extractFloatx80Frac(a) << 1))
5701         || (extractFloatx80Exp(b) == 0x7FFF
5702             && (uint64_t) (extractFloatx80Frac(b) << 1))
5703        ) {
5704         float_raise(float_flag_invalid, status);
5705         return 0;
5706     }
5707     return
5708            ( a.low == b.low )
5709         && (    ( a.high == b.high )
5710              || (    ( a.low == 0 )
5711                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5712            );
5713
5714 }
5715
5716 /*----------------------------------------------------------------------------
5717 | Returns 1 if the extended double-precision floating-point value `a' is
5718 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5719 | invalid exception is raised if either operand is a NaN.  The comparison is
5720 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5721 | Arithmetic.
5722 *----------------------------------------------------------------------------*/
5723
5724 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5725 {
5726     flag aSign, bSign;
5727
5728     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5729         || (extractFloatx80Exp(a) == 0x7FFF
5730             && (uint64_t) (extractFloatx80Frac(a) << 1))
5731         || (extractFloatx80Exp(b) == 0x7FFF
5732             && (uint64_t) (extractFloatx80Frac(b) << 1))
5733        ) {
5734         float_raise(float_flag_invalid, status);
5735         return 0;
5736     }
5737     aSign = extractFloatx80Sign( a );
5738     bSign = extractFloatx80Sign( b );
5739     if ( aSign != bSign ) {
5740         return
5741                aSign
5742             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5743                  == 0 );
5744     }
5745     return
5746           aSign ? le128( b.high, b.low, a.high, a.low )
5747         : le128( a.high, a.low, b.high, b.low );
5748
5749 }
5750
5751 /*----------------------------------------------------------------------------
5752 | Returns 1 if the extended double-precision floating-point value `a' is
5753 | less than the corresponding value `b', and 0 otherwise.  The invalid
5754 | exception is raised if either operand is a NaN.  The comparison is performed
5755 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5756 *----------------------------------------------------------------------------*/
5757
5758 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5759 {
5760     flag aSign, bSign;
5761
5762     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5763         || (extractFloatx80Exp(a) == 0x7FFF
5764             && (uint64_t) (extractFloatx80Frac(a) << 1))
5765         || (extractFloatx80Exp(b) == 0x7FFF
5766             && (uint64_t) (extractFloatx80Frac(b) << 1))
5767        ) {
5768         float_raise(float_flag_invalid, status);
5769         return 0;
5770     }
5771     aSign = extractFloatx80Sign( a );
5772     bSign = extractFloatx80Sign( b );
5773     if ( aSign != bSign ) {
5774         return
5775                aSign
5776             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5777                  != 0 );
5778     }
5779     return
5780           aSign ? lt128( b.high, b.low, a.high, a.low )
5781         : lt128( a.high, a.low, b.high, b.low );
5782
5783 }
5784
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5787 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5788 | either operand is a NaN.   The comparison is performed according to the
5789 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5791 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5792 {
5793     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5794         || (extractFloatx80Exp(a) == 0x7FFF
5795             && (uint64_t) (extractFloatx80Frac(a) << 1))
5796         || (extractFloatx80Exp(b) == 0x7FFF
5797             && (uint64_t) (extractFloatx80Frac(b) << 1))
5798        ) {
5799         float_raise(float_flag_invalid, status);
5800         return 1;
5801     }
5802     return 0;
5803 }
5804
5805 /*----------------------------------------------------------------------------
5806 | Returns 1 if the extended double-precision floating-point value `a' is
5807 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5808 | cause an exception.  The comparison is performed according to the IEC/IEEE
5809 | Standard for Binary Floating-Point Arithmetic.
5810 *----------------------------------------------------------------------------*/
5811
5812 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5813 {
5814
5815     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5816         float_raise(float_flag_invalid, status);
5817         return 0;
5818     }
5819     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5820               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5821          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5822               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5823        ) {
5824         if (floatx80_is_signaling_nan(a, status)
5825          || floatx80_is_signaling_nan(b, status)) {
5826             float_raise(float_flag_invalid, status);
5827         }
5828         return 0;
5829     }
5830     return
5831            ( a.low == b.low )
5832         && (    ( a.high == b.high )
5833              || (    ( a.low == 0 )
5834                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5835            );
5836
5837 }
5838
5839 /*----------------------------------------------------------------------------
5840 | Returns 1 if the extended double-precision floating-point value `a' is less
5841 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5842 | do not cause an exception.  Otherwise, the comparison is performed according
5843 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5844 *----------------------------------------------------------------------------*/
5845
5846 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5847 {
5848     flag aSign, bSign;
5849
5850     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5851         float_raise(float_flag_invalid, status);
5852         return 0;
5853     }
5854     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5855               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5856          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5857               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5858        ) {
5859         if (floatx80_is_signaling_nan(a, status)
5860          || floatx80_is_signaling_nan(b, status)) {
5861             float_raise(float_flag_invalid, status);
5862         }
5863         return 0;
5864     }
5865     aSign = extractFloatx80Sign( a );
5866     bSign = extractFloatx80Sign( b );
5867     if ( aSign != bSign ) {
5868         return
5869                aSign
5870             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5871                  == 0 );
5872     }
5873     return
5874           aSign ? le128( b.high, b.low, a.high, a.low )
5875         : le128( a.high, a.low, b.high, b.low );
5876
5877 }
5878
5879 /*----------------------------------------------------------------------------
5880 | Returns 1 if the extended double-precision floating-point value `a' is less
5881 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5882 | an exception.  Otherwise, the comparison is performed according to the
5883 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5884 *----------------------------------------------------------------------------*/
5885
5886 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5887 {
5888     flag aSign, bSign;
5889
5890     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5891         float_raise(float_flag_invalid, status);
5892         return 0;
5893     }
5894     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5895               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5896          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5897               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5898        ) {
5899         if (floatx80_is_signaling_nan(a, status)
5900          || floatx80_is_signaling_nan(b, status)) {
5901             float_raise(float_flag_invalid, status);
5902         }
5903         return 0;
5904     }
5905     aSign = extractFloatx80Sign( a );
5906     bSign = extractFloatx80Sign( b );
5907     if ( aSign != bSign ) {
5908         return
5909                aSign
5910             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5911                  != 0 );
5912     }
5913     return
5914           aSign ? lt128( b.high, b.low, a.high, a.low )
5915         : lt128( a.high, a.low, b.high, b.low );
5916
5917 }
5918
5919 /*----------------------------------------------------------------------------
5920 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5921 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5922 | The comparison is performed according to the IEC/IEEE Standard for Binary
5923 | Floating-Point Arithmetic.
5924 *----------------------------------------------------------------------------*/
5925 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5926 {
5927     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5928         float_raise(float_flag_invalid, status);
5929         return 1;
5930     }
5931     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5932               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5933          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5934               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5935        ) {
5936         if (floatx80_is_signaling_nan(a, status)
5937          || floatx80_is_signaling_nan(b, status)) {
5938             float_raise(float_flag_invalid, status);
5939         }
5940         return 1;
5941     }
5942     return 0;
5943 }
5944
5945 /*----------------------------------------------------------------------------
5946 | Returns the result of converting the quadruple-precision floating-point
5947 | value `a' to the 32-bit two's complement integer format.  The conversion
5948 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5949 | Arithmetic---which means in particular that the conversion is rounded
5950 | according to the current rounding mode.  If `a' is a NaN, the largest
5951 | positive integer is returned.  Otherwise, if the conversion overflows, the
5952 | largest integer with the same sign as `a' is returned.
5953 *----------------------------------------------------------------------------*/
5954
5955 int32_t float128_to_int32(float128 a, float_status *status)
5956 {
5957     flag aSign;
5958     int32_t aExp, shiftCount;
5959     uint64_t aSig0, aSig1;
5960
5961     aSig1 = extractFloat128Frac1( a );
5962     aSig0 = extractFloat128Frac0( a );
5963     aExp = extractFloat128Exp( a );
5964     aSign = extractFloat128Sign( a );
5965     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5966     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5967     aSig0 |= ( aSig1 != 0 );
5968     shiftCount = 0x4028 - aExp;
5969     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5970     return roundAndPackInt32(aSign, aSig0, status);
5971
5972 }
5973
5974 /*----------------------------------------------------------------------------
5975 | Returns the result of converting the quadruple-precision floating-point
5976 | value `a' to the 32-bit two's complement integer format.  The conversion
5977 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5978 | Arithmetic, except that the conversion is always rounded toward zero.  If
5979 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5980 | conversion overflows, the largest integer with the same sign as `a' is
5981 | returned.
5982 *----------------------------------------------------------------------------*/
5983
5984 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5985 {
5986     flag aSign;
5987     int32_t aExp, shiftCount;
5988     uint64_t aSig0, aSig1, savedASig;
5989     int32_t z;
5990
5991     aSig1 = extractFloat128Frac1( a );
5992     aSig0 = extractFloat128Frac0( a );
5993     aExp = extractFloat128Exp( a );
5994     aSign = extractFloat128Sign( a );
5995     aSig0 |= ( aSig1 != 0 );
5996     if ( 0x401E < aExp ) {
5997         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5998         goto invalid;
5999     }
6000     else if ( aExp < 0x3FFF ) {
6001         if (aExp || aSig0) {
6002             status->float_exception_flags |= float_flag_inexact;
6003         }
6004         return 0;
6005     }
6006     aSig0 |= LIT64( 0x0001000000000000 );
6007     shiftCount = 0x402F - aExp;
6008     savedASig = aSig0;
6009     aSig0 >>= shiftCount;
6010     z = aSig0;
6011     if ( aSign ) z = - z;
6012     if ( ( z < 0 ) ^ aSign ) {
6013  invalid:
6014         float_raise(float_flag_invalid, status);
6015         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6016     }
6017     if ( ( aSig0<<shiftCount ) != savedASig ) {
6018         status->float_exception_flags |= float_flag_inexact;
6019     }
6020     return z;
6021
6022 }
6023
6024 /*----------------------------------------------------------------------------
6025 | Returns the result of converting the quadruple-precision floating-point
6026 | value `a' to the 64-bit two's complement integer format.  The conversion
6027 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6028 | Arithmetic---which means in particular that the conversion is rounded
6029 | according to the current rounding mode.  If `a' is a NaN, the largest
6030 | positive integer is returned.  Otherwise, if the conversion overflows, the
6031 | largest integer with the same sign as `a' is returned.
6032 *----------------------------------------------------------------------------*/
6033
6034 int64_t float128_to_int64(float128 a, float_status *status)
6035 {
6036     flag aSign;
6037     int32_t aExp, shiftCount;
6038     uint64_t aSig0, aSig1;
6039
6040     aSig1 = extractFloat128Frac1( a );
6041     aSig0 = extractFloat128Frac0( a );
6042     aExp = extractFloat128Exp( a );
6043     aSign = extractFloat128Sign( a );
6044     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6045     shiftCount = 0x402F - aExp;
6046     if ( shiftCount <= 0 ) {
6047         if ( 0x403E < aExp ) {
6048             float_raise(float_flag_invalid, status);
6049             if (    ! aSign
6050                  || (    ( aExp == 0x7FFF )
6051                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6052                     )
6053                ) {
6054                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6055             }
6056             return (int64_t) LIT64( 0x8000000000000000 );
6057         }
6058         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6059     }
6060     else {
6061         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6062     }
6063     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6064
6065 }
6066
6067 /*----------------------------------------------------------------------------
6068 | Returns the result of converting the quadruple-precision floating-point
6069 | value `a' to the 64-bit two's complement integer format.  The conversion
6070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6071 | Arithmetic, except that the conversion is always rounded toward zero.
6072 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6073 | the conversion overflows, the largest integer with the same sign as `a' is
6074 | returned.
6075 *----------------------------------------------------------------------------*/
6076
6077 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6078 {
6079     flag aSign;
6080     int32_t aExp, shiftCount;
6081     uint64_t aSig0, aSig1;
6082     int64_t z;
6083
6084     aSig1 = extractFloat128Frac1( a );
6085     aSig0 = extractFloat128Frac0( a );
6086     aExp = extractFloat128Exp( a );
6087     aSign = extractFloat128Sign( a );
6088     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6089     shiftCount = aExp - 0x402F;
6090     if ( 0 < shiftCount ) {
6091         if ( 0x403E <= aExp ) {
6092             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6093             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6094                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6095                 if (aSig1) {
6096                     status->float_exception_flags |= float_flag_inexact;
6097                 }
6098             }
6099             else {
6100                 float_raise(float_flag_invalid, status);
6101                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6102                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6103                 }
6104             }
6105             return (int64_t) LIT64( 0x8000000000000000 );
6106         }
6107         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6108         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6109             status->float_exception_flags |= float_flag_inexact;
6110         }
6111     }
6112     else {
6113         if ( aExp < 0x3FFF ) {
6114             if ( aExp | aSig0 | aSig1 ) {
6115                 status->float_exception_flags |= float_flag_inexact;
6116             }
6117             return 0;
6118         }
6119         z = aSig0>>( - shiftCount );
6120         if (    aSig1
6121              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6122             status->float_exception_flags |= float_flag_inexact;
6123         }
6124     }
6125     if ( aSign ) z = - z;
6126     return z;
6127
6128 }
6129
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of converting the quadruple-precision floating-point value
6132 | `a' to the 64-bit unsigned integer format.  The conversion is
6133 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6134 | Arithmetic---which means in particular that the conversion is rounded
6135 | according to the current rounding mode.  If `a' is a NaN, the largest
6136 | positive integer is returned.  If the conversion overflows, the
6137 | largest unsigned integer is returned.  If 'a' is negative, the value is
6138 | rounded and zero is returned; negative values that do not round to zero
6139 | will raise the inexact exception.
6140 *----------------------------------------------------------------------------*/
6141
6142 uint64_t float128_to_uint64(float128 a, float_status *status)
6143 {
6144     flag aSign;
6145     int aExp;
6146     int shiftCount;
6147     uint64_t aSig0, aSig1;
6148
6149     aSig0 = extractFloat128Frac0(a);
6150     aSig1 = extractFloat128Frac1(a);
6151     aExp = extractFloat128Exp(a);
6152     aSign = extractFloat128Sign(a);
6153     if (aSign && (aExp > 0x3FFE)) {
6154         float_raise(float_flag_invalid, status);
6155         if (float128_is_any_nan(a)) {
6156             return LIT64(0xFFFFFFFFFFFFFFFF);
6157         } else {
6158             return 0;
6159         }
6160     }
6161     if (aExp) {
6162         aSig0 |= LIT64(0x0001000000000000);
6163     }
6164     shiftCount = 0x402F - aExp;
6165     if (shiftCount <= 0) {
6166         if (0x403E < aExp) {
6167             float_raise(float_flag_invalid, status);
6168             return LIT64(0xFFFFFFFFFFFFFFFF);
6169         }
6170         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6171     } else {
6172         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6173     }
6174     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6175 }
6176
6177 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6178 {
6179     uint64_t v;
6180     signed char current_rounding_mode = status->float_rounding_mode;
6181
6182     set_float_rounding_mode(float_round_to_zero, status);
6183     v = float128_to_uint64(a, status);
6184     set_float_rounding_mode(current_rounding_mode, status);
6185
6186     return v;
6187 }
6188
6189 /*----------------------------------------------------------------------------
6190 | Returns the result of converting the quadruple-precision floating-point
6191 | value `a' to the single-precision floating-point format.  The conversion
6192 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6193 | Arithmetic.
6194 *----------------------------------------------------------------------------*/
6195
6196 float32 float128_to_float32(float128 a, float_status *status)
6197 {
6198     flag aSign;
6199     int32_t aExp;
6200     uint64_t aSig0, aSig1;
6201     uint32_t zSig;
6202
6203     aSig1 = extractFloat128Frac1( a );
6204     aSig0 = extractFloat128Frac0( a );
6205     aExp = extractFloat128Exp( a );
6206     aSign = extractFloat128Sign( a );
6207     if ( aExp == 0x7FFF ) {
6208         if ( aSig0 | aSig1 ) {
6209             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6210         }
6211         return packFloat32( aSign, 0xFF, 0 );
6212     }
6213     aSig0 |= ( aSig1 != 0 );
6214     shift64RightJamming( aSig0, 18, &aSig0 );
6215     zSig = aSig0;
6216     if ( aExp || zSig ) {
6217         zSig |= 0x40000000;
6218         aExp -= 0x3F81;
6219     }
6220     return roundAndPackFloat32(aSign, aExp, zSig, status);
6221
6222 }
6223
6224 /*----------------------------------------------------------------------------
6225 | Returns the result of converting the quadruple-precision floating-point
6226 | value `a' to the double-precision floating-point format.  The conversion
6227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6228 | Arithmetic.
6229 *----------------------------------------------------------------------------*/
6230
6231 float64 float128_to_float64(float128 a, float_status *status)
6232 {
6233     flag aSign;
6234     int32_t aExp;
6235     uint64_t aSig0, aSig1;
6236
6237     aSig1 = extractFloat128Frac1( a );
6238     aSig0 = extractFloat128Frac0( a );
6239     aExp = extractFloat128Exp( a );
6240     aSign = extractFloat128Sign( a );
6241     if ( aExp == 0x7FFF ) {
6242         if ( aSig0 | aSig1 ) {
6243             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6244         }
6245         return packFloat64( aSign, 0x7FF, 0 );
6246     }
6247     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6248     aSig0 |= ( aSig1 != 0 );
6249     if ( aExp || aSig0 ) {
6250         aSig0 |= LIT64( 0x4000000000000000 );
6251         aExp -= 0x3C01;
6252     }
6253     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6254
6255 }
6256
6257 /*----------------------------------------------------------------------------
6258 | Returns the result of converting the quadruple-precision floating-point
6259 | value `a' to the extended double-precision floating-point format.  The
6260 | conversion is performed according to the IEC/IEEE Standard for Binary
6261 | Floating-Point Arithmetic.
6262 *----------------------------------------------------------------------------*/
6263
6264 floatx80 float128_to_floatx80(float128 a, float_status *status)
6265 {
6266     flag aSign;
6267     int32_t aExp;
6268     uint64_t aSig0, aSig1;
6269
6270     aSig1 = extractFloat128Frac1( a );
6271     aSig0 = extractFloat128Frac0( a );
6272     aExp = extractFloat128Exp( a );
6273     aSign = extractFloat128Sign( a );
6274     if ( aExp == 0x7FFF ) {
6275         if ( aSig0 | aSig1 ) {
6276             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6277         }
6278         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6279     }
6280     if ( aExp == 0 ) {
6281         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6282         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6283     }
6284     else {
6285         aSig0 |= LIT64( 0x0001000000000000 );
6286     }
6287     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6288     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6289
6290 }
6291
6292 /*----------------------------------------------------------------------------
6293 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6294 | returns the result as a quadruple-precision floating-point value.  The
6295 | operation is performed according to the IEC/IEEE Standard for Binary
6296 | Floating-Point Arithmetic.
6297 *----------------------------------------------------------------------------*/
6298
6299 float128 float128_round_to_int(float128 a, float_status *status)
6300 {
6301     flag aSign;
6302     int32_t aExp;
6303     uint64_t lastBitMask, roundBitsMask;
6304     float128 z;
6305
6306     aExp = extractFloat128Exp( a );
6307     if ( 0x402F <= aExp ) {
6308         if ( 0x406F <= aExp ) {
6309             if (    ( aExp == 0x7FFF )
6310                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6311                ) {
6312                 return propagateFloat128NaN(a, a, status);
6313             }
6314             return a;
6315         }
6316         lastBitMask = 1;
6317         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6318         roundBitsMask = lastBitMask - 1;
6319         z = a;
6320         switch (status->float_rounding_mode) {
6321         case float_round_nearest_even:
6322             if ( lastBitMask ) {
6323                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6324                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6325             }
6326             else {
6327                 if ( (int64_t) z.low < 0 ) {
6328                     ++z.high;
6329                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6330                 }
6331             }
6332             break;
6333         case float_round_ties_away:
6334             if (lastBitMask) {
6335                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6336             } else {
6337                 if ((int64_t) z.low < 0) {
6338                     ++z.high;
6339                 }
6340             }
6341             break;
6342         case float_round_to_zero:
6343             break;
6344         case float_round_up:
6345             if (!extractFloat128Sign(z)) {
6346                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6347             }
6348             break;
6349         case float_round_down:
6350             if (extractFloat128Sign(z)) {
6351                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6352             }
6353             break;
6354         default:
6355             abort();
6356         }
6357         z.low &= ~ roundBitsMask;
6358     }
6359     else {
6360         if ( aExp < 0x3FFF ) {
6361             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6362             status->float_exception_flags |= float_flag_inexact;
6363             aSign = extractFloat128Sign( a );
6364             switch (status->float_rounding_mode) {
6365              case float_round_nearest_even:
6366                 if (    ( aExp == 0x3FFE )
6367                      && (   extractFloat128Frac0( a )
6368                           | extractFloat128Frac1( a ) )
6369                    ) {
6370                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6371                 }
6372                 break;
6373             case float_round_ties_away:
6374                 if (aExp == 0x3FFE) {
6375                     return packFloat128(aSign, 0x3FFF, 0, 0);
6376                 }
6377                 break;
6378              case float_round_down:
6379                 return
6380                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6381                     : packFloat128( 0, 0, 0, 0 );
6382              case float_round_up:
6383                 return
6384                       aSign ? packFloat128( 1, 0, 0, 0 )
6385                     : packFloat128( 0, 0x3FFF, 0, 0 );
6386             }
6387             return packFloat128( aSign, 0, 0, 0 );
6388         }
6389         lastBitMask = 1;
6390         lastBitMask <<= 0x402F - aExp;
6391         roundBitsMask = lastBitMask - 1;
6392         z.low = 0;
6393         z.high = a.high;
6394         switch (status->float_rounding_mode) {
6395         case float_round_nearest_even:
6396             z.high += lastBitMask>>1;
6397             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6398                 z.high &= ~ lastBitMask;
6399             }
6400             break;
6401         case float_round_ties_away:
6402             z.high += lastBitMask>>1;
6403             break;
6404         case float_round_to_zero:
6405             break;
6406         case float_round_up:
6407             if (!extractFloat128Sign(z)) {
6408                 z.high |= ( a.low != 0 );
6409                 z.high += roundBitsMask;
6410             }
6411             break;
6412         case float_round_down:
6413             if (extractFloat128Sign(z)) {
6414                 z.high |= (a.low != 0);
6415                 z.high += roundBitsMask;
6416             }
6417             break;
6418         default:
6419             abort();
6420         }
6421         z.high &= ~ roundBitsMask;
6422     }
6423     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6424         status->float_exception_flags |= float_flag_inexact;
6425     }
6426     return z;
6427
6428 }
6429
6430 /*----------------------------------------------------------------------------
6431 | Returns the result of adding the absolute values of the quadruple-precision
6432 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6433 | before being returned.  `zSign' is ignored if the result is a NaN.
6434 | The addition is performed according to the IEC/IEEE Standard for Binary
6435 | Floating-Point Arithmetic.
6436 *----------------------------------------------------------------------------*/
6437
6438 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6439                                 float_status *status)
6440 {
6441     int32_t aExp, bExp, zExp;
6442     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6443     int32_t expDiff;
6444
6445     aSig1 = extractFloat128Frac1( a );
6446     aSig0 = extractFloat128Frac0( a );
6447     aExp = extractFloat128Exp( a );
6448     bSig1 = extractFloat128Frac1( b );
6449     bSig0 = extractFloat128Frac0( b );
6450     bExp = extractFloat128Exp( b );
6451     expDiff = aExp - bExp;
6452     if ( 0 < expDiff ) {
6453         if ( aExp == 0x7FFF ) {
6454             if (aSig0 | aSig1) {
6455                 return propagateFloat128NaN(a, b, status);
6456             }
6457             return a;
6458         }
6459         if ( bExp == 0 ) {
6460             --expDiff;
6461         }
6462         else {
6463             bSig0 |= LIT64( 0x0001000000000000 );
6464         }
6465         shift128ExtraRightJamming(
6466             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6467         zExp = aExp;
6468     }
6469     else if ( expDiff < 0 ) {
6470         if ( bExp == 0x7FFF ) {
6471             if (bSig0 | bSig1) {
6472                 return propagateFloat128NaN(a, b, status);
6473             }
6474             return packFloat128( zSign, 0x7FFF, 0, 0 );
6475         }
6476         if ( aExp == 0 ) {
6477             ++expDiff;
6478         }
6479         else {
6480             aSig0 |= LIT64( 0x0001000000000000 );
6481         }
6482         shift128ExtraRightJamming(
6483             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6484         zExp = bExp;
6485     }
6486     else {
6487         if ( aExp == 0x7FFF ) {
6488             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6489                 return propagateFloat128NaN(a, b, status);
6490             }
6491             return a;
6492         }
6493         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6494         if ( aExp == 0 ) {
6495             if (status->flush_to_zero) {
6496                 if (zSig0 | zSig1) {
6497                     float_raise(float_flag_output_denormal, status);
6498                 }
6499                 return packFloat128(zSign, 0, 0, 0);
6500             }
6501             return packFloat128( zSign, 0, zSig0, zSig1 );
6502         }
6503         zSig2 = 0;
6504         zSig0 |= LIT64( 0x0002000000000000 );
6505         zExp = aExp;
6506         goto shiftRight1;
6507     }
6508     aSig0 |= LIT64( 0x0001000000000000 );
6509     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6510     --zExp;
6511     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6512     ++zExp;
6513  shiftRight1:
6514     shift128ExtraRightJamming(
6515         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6516  roundAndPack:
6517     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6518
6519 }
6520
6521 /*----------------------------------------------------------------------------
6522 | Returns the result of subtracting the absolute values of the quadruple-
6523 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6524 | difference is negated before being returned.  `zSign' is ignored if the
6525 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6526 | Standard for Binary Floating-Point Arithmetic.
6527 *----------------------------------------------------------------------------*/
6528
6529 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6530                                 float_status *status)
6531 {
6532     int32_t aExp, bExp, zExp;
6533     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6534     int32_t expDiff;
6535
6536     aSig1 = extractFloat128Frac1( a );
6537     aSig0 = extractFloat128Frac0( a );
6538     aExp = extractFloat128Exp( a );
6539     bSig1 = extractFloat128Frac1( b );
6540     bSig0 = extractFloat128Frac0( b );
6541     bExp = extractFloat128Exp( b );
6542     expDiff = aExp - bExp;
6543     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6544     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6545     if ( 0 < expDiff ) goto aExpBigger;
6546     if ( expDiff < 0 ) goto bExpBigger;
6547     if ( aExp == 0x7FFF ) {
6548         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6549             return propagateFloat128NaN(a, b, status);
6550         }
6551         float_raise(float_flag_invalid, status);
6552         return float128_default_nan(status);
6553     }
6554     if ( aExp == 0 ) {
6555         aExp = 1;
6556         bExp = 1;
6557     }
6558     if ( bSig0 < aSig0 ) goto aBigger;
6559     if ( aSig0 < bSig0 ) goto bBigger;
6560     if ( bSig1 < aSig1 ) goto aBigger;
6561     if ( aSig1 < bSig1 ) goto bBigger;
6562     return packFloat128(status->float_rounding_mode == float_round_down,
6563                         0, 0, 0);
6564  bExpBigger:
6565     if ( bExp == 0x7FFF ) {
6566         if (bSig0 | bSig1) {
6567             return propagateFloat128NaN(a, b, status);
6568         }
6569         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6570     }
6571     if ( aExp == 0 ) {
6572         ++expDiff;
6573     }
6574     else {
6575         aSig0 |= LIT64( 0x4000000000000000 );
6576     }
6577     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6578     bSig0 |= LIT64( 0x4000000000000000 );
6579  bBigger:
6580     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6581     zExp = bExp;
6582     zSign ^= 1;
6583     goto normalizeRoundAndPack;
6584  aExpBigger:
6585     if ( aExp == 0x7FFF ) {
6586         if (aSig0 | aSig1) {
6587             return propagateFloat128NaN(a, b, status);
6588         }
6589         return a;
6590     }
6591     if ( bExp == 0 ) {
6592         --expDiff;
6593     }
6594     else {
6595         bSig0 |= LIT64( 0x4000000000000000 );
6596     }
6597     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6598     aSig0 |= LIT64( 0x4000000000000000 );
6599  aBigger:
6600     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6601     zExp = aExp;
6602  normalizeRoundAndPack:
6603     --zExp;
6604     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6605                                          status);
6606
6607 }
6608
6609 /*----------------------------------------------------------------------------
6610 | Returns the result of adding the quadruple-precision floating-point values
6611 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6612 | for Binary Floating-Point Arithmetic.
6613 *----------------------------------------------------------------------------*/
6614
6615 float128 float128_add(float128 a, float128 b, float_status *status)
6616 {
6617     flag aSign, bSign;
6618
6619     aSign = extractFloat128Sign( a );
6620     bSign = extractFloat128Sign( b );
6621     if ( aSign == bSign ) {
6622         return addFloat128Sigs(a, b, aSign, status);
6623     }
6624     else {
6625         return subFloat128Sigs(a, b, aSign, status);
6626     }
6627
6628 }
6629
6630 /*----------------------------------------------------------------------------
6631 | Returns the result of subtracting the quadruple-precision floating-point
6632 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6633 | Standard for Binary Floating-Point Arithmetic.
6634 *----------------------------------------------------------------------------*/
6635
6636 float128 float128_sub(float128 a, float128 b, float_status *status)
6637 {
6638     flag aSign, bSign;
6639
6640     aSign = extractFloat128Sign( a );
6641     bSign = extractFloat128Sign( b );
6642     if ( aSign == bSign ) {
6643         return subFloat128Sigs(a, b, aSign, status);
6644     }
6645     else {
6646         return addFloat128Sigs(a, b, aSign, status);
6647     }
6648
6649 }
6650
6651 /*----------------------------------------------------------------------------
6652 | Returns the result of multiplying the quadruple-precision floating-point
6653 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6654 | Standard for Binary Floating-Point Arithmetic.
6655 *----------------------------------------------------------------------------*/
6656
6657 float128 float128_mul(float128 a, float128 b, float_status *status)
6658 {
6659     flag aSign, bSign, zSign;
6660     int32_t aExp, bExp, zExp;
6661     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6662
6663     aSig1 = extractFloat128Frac1( a );
6664     aSig0 = extractFloat128Frac0( a );
6665     aExp = extractFloat128Exp( a );
6666     aSign = extractFloat128Sign( a );
6667     bSig1 = extractFloat128Frac1( b );
6668     bSig0 = extractFloat128Frac0( b );
6669     bExp = extractFloat128Exp( b );
6670     bSign = extractFloat128Sign( b );
6671     zSign = aSign ^ bSign;
6672     if ( aExp == 0x7FFF ) {
6673         if (    ( aSig0 | aSig1 )
6674              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6675             return propagateFloat128NaN(a, b, status);
6676         }
6677         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6678         return packFloat128( zSign, 0x7FFF, 0, 0 );
6679     }
6680     if ( bExp == 0x7FFF ) {
6681         if (bSig0 | bSig1) {
6682             return propagateFloat128NaN(a, b, status);
6683         }
6684         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6685  invalid:
6686             float_raise(float_flag_invalid, status);
6687             return float128_default_nan(status);
6688         }
6689         return packFloat128( zSign, 0x7FFF, 0, 0 );
6690     }
6691     if ( aExp == 0 ) {
6692         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6693         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6694     }
6695     if ( bExp == 0 ) {
6696         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6697         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6698     }
6699     zExp = aExp + bExp - 0x4000;
6700     aSig0 |= LIT64( 0x0001000000000000 );
6701     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6702     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6703     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6704     zSig2 |= ( zSig3 != 0 );
6705     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6706         shift128ExtraRightJamming(
6707             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6708         ++zExp;
6709     }
6710     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6711
6712 }
6713
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special operands are resolved before any arithmetic is done:
|   - a NaN operand is handed to propagateFloat128NaN();
|   - infinity / infinity and zero / zero raise the invalid exception and
|     return the default NaN;
|   - infinity / finite returns an infinity, and finite / infinity returns a
|     zero, each signed with the exclusive-or of the operand signs;
|   - non-zero finite / zero raises the divide-by-zero exception and returns
|     a signed infinity;
|   - zero / non-zero finite returns a signed zero.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* Dividend is infinity or NaN. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Infinity divided by infinity. */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* Divisor is infinity or NaN; the dividend is finite here. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );
    }
    /* Divisor is zero or subnormal. */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* Zero divided by zero; also the target of the
                 * infinity / infinity case above. */
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /* Dividend is zero or subnormal. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Make the implicit integer bit (bit 48) explicit and shift both
     * significands left by 15 so that it lands in bit 63. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* When the le128() test on (divisor, dividend) holds, halve the dividend
     * and bump the exponent to compensate. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* Upper 64 quotient bits: take an estimate, form the remainder
     * a - b * zSig0, then step the estimate down (adding b back) for as long
     * as that remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits, estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /* NOTE(review): the exact correction below runs only when the low 14 bits
     * of the estimate are at most 4 -- presumably the only case in which the
     * estimate's error could change the rounded result; confirm against the
     * error bound of estimateDiv128To64(). */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* A non-zero final remainder is recorded as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit quotient right by 15; the extra output word zSig2 is
     * passed on to the rounding routine. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6800
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special operands are resolved before any arithmetic is done:
|   - a NaN operand is handed to propagateFloat128NaN();
|   - an infinite `a' or a zero `b' raises the invalid exception and returns
|     the default NaN;
|   - an infinite `b' (with finite `a') or a zero `a' returns `a' unchanged;
|   - `a' is also returned unchanged when its exponent is at least two below
|     that of `b'.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* `a' is infinity or NaN. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Remainder of an infinity. */
        goto invalid;
    }
    /* `b' is infinity or NaN; `a' is finite here. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Remainder with respect to zero; also the target of the
             * infinite-`a' case above. */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    /* Make the implicit integer bit explicit and left-align both
     * significands; `a' is shifted one place less when its exponent is one
     * below that of `b'. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract `b' once if the le128() test holds. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /* Reduction loop: each pass lowers expDiff by 61.
     * NOTE(review): the estimate is lowered by 4 (clamped at 0), presumably so
     * that it can never exceed the true quotient; confirm against the error
     * bound of estimateDiv128To64(). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: keep only the quotient bits that are still
         * needed (q >>= -expDiff), then rescale `a' and `b' before
         * subtracting q * b. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        /* No further quotient bits: only rescale both values by 12 bits. */
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /* Keep subtracting `b' until the running value turns negative, saving the
     * value from before the latest subtraction in alternateASig. */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /* The sum of the two candidates tells which has the smaller magnitude:
     * a negative sum selects the saved non-negative one, and a zero sum (a
     * tie) selects it only when q is odd. */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Take the absolute value of the chosen candidate; its sign is combined
     * with the sign of `a' for the result. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
6907
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special operands are resolved before any arithmetic is done:
|   - a NaN operand is handed to propagateFloat128NaN();
|   - positive infinity and negative zero are returned unchanged, and
|     positive zero yields positive zero;
|   - any other negative operand, including negative infinity, raises the
|     invalid exception and returns the default NaN.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Infinity or NaN. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        /* Square root of negative infinity. */
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; other negatives are invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias it. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= LIT64( 0x0001000000000000 );
    /* NOTE(review): estimateSqrt32() presumably returns a 32-bit first
     * approximation of the root from the top significand bits and the
     * exponent parity; confirm against its definition. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift is one bit shorter when the exponent is odd. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    /* Widen the approximation to 64 bits using one division. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder a - zSig0^2; step zSig0 down while it is negative, adding
     * back 2 * zSig0 + 1 for each step. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 bits of the root, estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* NOTE(review): the exact correction below runs only when the low 13 bits
     * of the estimate are at most 5 -- presumably the only case in which the
     * estimate's error could change the rounded result; confirm against the
     * error bound of estimateDiv128To64(). */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* A non-zero final remainder is recorded as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit root right by 14; the extra output word zSig2 is
     * passed on to the rounding routine. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
6975
6976 /*----------------------------------------------------------------------------
6977 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6978 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6979 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6981 *----------------------------------------------------------------------------*/
6982
6983 int float128_eq(float128 a, float128 b, float_status *status)
6984 {
6985
6986     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6987               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6988          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6989               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6990        ) {
6991         float_raise(float_flag_invalid, status);
6992         return 0;
6993     }
6994     return
6995            ( a.low == b.low )
6996         && (    ( a.high == b.high )
6997              || (    ( a.low == 0 )
6998                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6999            );
7000
7001 }
7002
7003 /*----------------------------------------------------------------------------
7004 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7005 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7006 | exception is raised if either operand is a NaN.  The comparison is performed
7007 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7008 *----------------------------------------------------------------------------*/
7009
7010 int float128_le(float128 a, float128 b, float_status *status)
7011 {
7012     flag aSign, bSign;
7013
7014     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7015               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7016          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7017               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7018        ) {
7019         float_raise(float_flag_invalid, status);
7020         return 0;
7021     }
7022     aSign = extractFloat128Sign( a );
7023     bSign = extractFloat128Sign( b );
7024     if ( aSign != bSign ) {
7025         return
7026                aSign
7027             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7028                  == 0 );
7029     }
7030     return
7031           aSign ? le128( b.high, b.low, a.high, a.low )
7032         : le128( a.high, a.low, b.high, b.low );
7033
7034 }
7035
7036 /*----------------------------------------------------------------------------
7037 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7038 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7039 | raised if either operand is a NaN.  The comparison is performed according
7040 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7041 *----------------------------------------------------------------------------*/
7042
7043 int float128_lt(float128 a, float128 b, float_status *status)
7044 {
7045     flag aSign, bSign;
7046
7047     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7048               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7049          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7050               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7051        ) {
7052         float_raise(float_flag_invalid, status);
7053         return 0;
7054     }
7055     aSign = extractFloat128Sign( a );
7056     bSign = extractFloat128Sign( b );
7057     if ( aSign != bSign ) {
7058         return
7059                aSign
7060             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7061                  != 0 );
7062     }
7063     return
7064           aSign ? lt128( b.high, b.low, a.high, a.low )
7065         : lt128( a.high, a.low, b.high, b.low );
7066
7067 }
7068
7069 /*----------------------------------------------------------------------------
7070 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7071 | be compared, and 0 otherwise.  The invalid exception is raised if either
7072 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7073 | Standard for Binary Floating-Point Arithmetic.
7074 *----------------------------------------------------------------------------*/
7075
7076 int float128_unordered(float128 a, float128 b, float_status *status)
7077 {
7078     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7079               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7080          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7081               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7082        ) {
7083         float_raise(float_flag_invalid, status);
7084         return 1;
7085     }
7086     return 0;
7087 }
7088
7089 /*----------------------------------------------------------------------------
7090 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7091 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7092 | exception.  The comparison is performed according to the IEC/IEEE Standard
7093 | for Binary Floating-Point Arithmetic.
7094 *----------------------------------------------------------------------------*/
7095
7096 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7097 {
7098
7099     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7100               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7101          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7102               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7103        ) {
7104         if (float128_is_signaling_nan(a, status)
7105          || float128_is_signaling_nan(b, status)) {
7106             float_raise(float_flag_invalid, status);
7107         }
7108         return 0;
7109     }
7110     return
7111            ( a.low == b.low )
7112         && (    ( a.high == b.high )
7113              || (    ( a.low == 0 )
7114                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7115            );
7116
7117 }
7118
7119 /*----------------------------------------------------------------------------
7120 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7121 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7122 | cause an exception.  Otherwise, the comparison is performed according to the
7123 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7124 *----------------------------------------------------------------------------*/
7125
7126 int float128_le_quiet(float128 a, float128 b, float_status *status)
7127 {
7128     flag aSign, bSign;
7129
7130     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7131               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7132          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7133               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7134        ) {
7135         if (float128_is_signaling_nan(a, status)
7136          || float128_is_signaling_nan(b, status)) {
7137             float_raise(float_flag_invalid, status);
7138         }
7139         return 0;
7140     }
7141     aSign = extractFloat128Sign( a );
7142     bSign = extractFloat128Sign( b );
7143     if ( aSign != bSign ) {
7144         return
7145                aSign
7146             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7147                  == 0 );
7148     }
7149     return
7150           aSign ? le128( b.high, b.low, a.high, a.low )
7151         : le128( a.high, a.low, b.high, b.low );
7152
7153 }
7154
7155 /*----------------------------------------------------------------------------
7156 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7157 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7158 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7159 | Standard for Binary Floating-Point Arithmetic.
7160 *----------------------------------------------------------------------------*/
7161
7162 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7163 {
7164     flag aSign, bSign;
7165
7166     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7167               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7168          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7169               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7170        ) {
7171         if (float128_is_signaling_nan(a, status)
7172          || float128_is_signaling_nan(b, status)) {
7173             float_raise(float_flag_invalid, status);
7174         }
7175         return 0;
7176     }
7177     aSign = extractFloat128Sign( a );
7178     bSign = extractFloat128Sign( b );
7179     if ( aSign != bSign ) {
7180         return
7181                aSign
7182             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7183                  != 0 );
7184     }
7185     return
7186           aSign ? lt128( b.high, b.low, a.high, a.low )
7187         : lt128( a.high, a.low, b.high, b.low );
7188
7189 }
7190
7191 /*----------------------------------------------------------------------------
7192 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7193 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7194 | comparison is performed according to the IEC/IEEE Standard for Binary
7195 | Floating-Point Arithmetic.
7196 *----------------------------------------------------------------------------*/
7197
7198 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7199 {
7200     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7201               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7202          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7203               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7204        ) {
7205         if (float128_is_signaling_nan(a, status)
7206          || float128_is_signaling_nan(b, status)) {
7207             float_raise(float_flag_invalid, status);
7208         }
7209         return 1;
7210     }
7211     return 0;
7212 }
7213
7214 /* misc functions */
7215 float32 uint32_to_float32(uint32_t a, float_status *status)
7216 {
7217     return int64_to_float32(a, status);
7218 }
7219
7220 float64 uint32_to_float64(uint32_t a, float_status *status)
7221 {
7222     return int64_to_float64(a, status);
7223 }
7224
7225 uint32_t float32_to_uint32(float32 a, float_status *status)
7226 {
7227     int64_t v;
7228     uint32_t res;
7229     int old_exc_flags = get_float_exception_flags(status);
7230
7231     v = float32_to_int64(a, status);
7232     if (v < 0) {
7233         res = 0;
7234     } else if (v > 0xffffffff) {
7235         res = 0xffffffff;
7236     } else {
7237         return v;
7238     }
7239     set_float_exception_flags(old_exc_flags, status);
7240     float_raise(float_flag_invalid, status);
7241     return res;
7242 }
7243
7244 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7245 {
7246     int64_t v;
7247     uint32_t res;
7248     int old_exc_flags = get_float_exception_flags(status);
7249
7250     v = float32_to_int64_round_to_zero(a, status);
7251     if (v < 0) {
7252         res = 0;
7253     } else if (v > 0xffffffff) {
7254         res = 0xffffffff;
7255     } else {
7256         return v;
7257     }
7258     set_float_exception_flags(old_exc_flags, status);
7259     float_raise(float_flag_invalid, status);
7260     return res;
7261 }
7262
7263 int16_t float32_to_int16(float32 a, float_status *status)
7264 {
7265     int32_t v;
7266     int16_t res;
7267     int old_exc_flags = get_float_exception_flags(status);
7268
7269     v = float32_to_int32(a, status);
7270     if (v < -0x8000) {
7271         res = -0x8000;
7272     } else if (v > 0x7fff) {
7273         res = 0x7fff;
7274     } else {
7275         return v;
7276     }
7277
7278     set_float_exception_flags(old_exc_flags, status);
7279     float_raise(float_flag_invalid, status);
7280     return res;
7281 }
7282
7283 uint16_t float32_to_uint16(float32 a, float_status *status)
7284 {
7285     int32_t v;
7286     uint16_t res;
7287     int old_exc_flags = get_float_exception_flags(status);
7288
7289     v = float32_to_int32(a, status);
7290     if (v < 0) {
7291         res = 0;
7292     } else if (v > 0xffff) {
7293         res = 0xffff;
7294     } else {
7295         return v;
7296     }
7297
7298     set_float_exception_flags(old_exc_flags, status);
7299     float_raise(float_flag_invalid, status);
7300     return res;
7301 }
7302
7303 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7304 {
7305     int64_t v;
7306     uint16_t res;
7307     int old_exc_flags = get_float_exception_flags(status);
7308
7309     v = float32_to_int64_round_to_zero(a, status);
7310     if (v < 0) {
7311         res = 0;
7312     } else if (v > 0xffff) {
7313         res = 0xffff;
7314     } else {
7315         return v;
7316     }
7317     set_float_exception_flags(old_exc_flags, status);
7318     float_raise(float_flag_invalid, status);
7319     return res;
7320 }
7321
7322 uint32_t float64_to_uint32(float64 a, float_status *status)
7323 {
7324     uint64_t v;
7325     uint32_t res;
7326     int old_exc_flags = get_float_exception_flags(status);
7327
7328     v = float64_to_uint64(a, status);
7329     if (v > 0xffffffff) {
7330         res = 0xffffffff;
7331     } else {
7332         return v;
7333     }
7334     set_float_exception_flags(old_exc_flags, status);
7335     float_raise(float_flag_invalid, status);
7336     return res;
7337 }
7338
7339 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7340 {
7341     uint64_t v;
7342     uint32_t res;
7343     int old_exc_flags = get_float_exception_flags(status);
7344
7345     v = float64_to_uint64_round_to_zero(a, status);
7346     if (v > 0xffffffff) {
7347         res = 0xffffffff;
7348     } else {
7349         return v;
7350     }
7351     set_float_exception_flags(old_exc_flags, status);
7352     float_raise(float_flag_invalid, status);
7353     return res;
7354 }
7355
7356 int16_t float64_to_int16(float64 a, float_status *status)
7357 {
7358     int64_t v;
7359     int16_t res;
7360     int old_exc_flags = get_float_exception_flags(status);
7361
7362     v = float64_to_int32(a, status);
7363     if (v < -0x8000) {
7364         res = -0x8000;
7365     } else if (v > 0x7fff) {
7366         res = 0x7fff;
7367     } else {
7368         return v;
7369     }
7370
7371     set_float_exception_flags(old_exc_flags, status);
7372     float_raise(float_flag_invalid, status);
7373     return res;
7374 }
7375
7376 uint16_t float64_to_uint16(float64 a, float_status *status)
7377 {
7378     int64_t v;
7379     uint16_t res;
7380     int old_exc_flags = get_float_exception_flags(status);
7381
7382     v = float64_to_int32(a, status);
7383     if (v < 0) {
7384         res = 0;
7385     } else if (v > 0xffff) {
7386         res = 0xffff;
7387     } else {
7388         return v;
7389     }
7390
7391     set_float_exception_flags(old_exc_flags, status);
7392     float_raise(float_flag_invalid, status);
7393     return res;
7394 }
7395
7396 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7397 {
7398     int64_t v;
7399     uint16_t res;
7400     int old_exc_flags = get_float_exception_flags(status);
7401
7402     v = float64_to_int64_round_to_zero(a, status);
7403     if (v < 0) {
7404         res = 0;
7405     } else if (v > 0xffff) {
7406         res = 0xffff;
7407     } else {
7408         return v;
7409     }
7410     set_float_exception_flags(old_exc_flags, status);
7411     float_raise(float_flag_invalid, status);
7412     return res;
7413 }
7414
7415 /*----------------------------------------------------------------------------
7416 | Returns the result of converting the double-precision floating-point value
7417 | `a' to the 64-bit unsigned integer format.  The conversion is
7418 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7419 | Arithmetic---which means in particular that the conversion is rounded
7420 | according to the current rounding mode.  If `a' is a NaN, the largest
7421 | positive integer is returned.  If the conversion overflows, the
7422 | largest unsigned integer is returned.  If 'a' is negative, the value is
7423 | rounded and zero is returned; negative values that do not round to zero
7424 | will raise the inexact exception.
7425 *----------------------------------------------------------------------------*/
7426
7427 uint64_t float64_to_uint64(float64 a, float_status *status)
7428 {
7429     flag aSign;
7430     int aExp;
7431     int shiftCount;
7432     uint64_t aSig, aSigExtra;
7433     a = float64_squash_input_denormal(a, status);
7434
7435     aSig = extractFloat64Frac(a);
7436     aExp = extractFloat64Exp(a);
7437     aSign = extractFloat64Sign(a);
7438     if (aSign && (aExp > 1022)) {
7439         float_raise(float_flag_invalid, status);
7440         if (float64_is_any_nan(a)) {
7441             return LIT64(0xFFFFFFFFFFFFFFFF);
7442         } else {
7443             return 0;
7444         }
7445     }
7446     if (aExp) {
7447         aSig |= LIT64(0x0010000000000000);
7448     }
7449     shiftCount = 0x433 - aExp;
7450     if (shiftCount <= 0) {
7451         if (0x43E < aExp) {
7452             float_raise(float_flag_invalid, status);
7453             return LIT64(0xFFFFFFFFFFFFFFFF);
7454         }
7455         aSigExtra = 0;
7456         aSig <<= -shiftCount;
7457     } else {
7458         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7459     }
7460     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7461 }
7462
7463 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7464 {
7465     signed char current_rounding_mode = status->float_rounding_mode;
7466     set_float_rounding_mode(float_round_to_zero, status);
7467     int64_t v = float64_to_uint64(a, status);
7468     set_float_rounding_mode(current_rounding_mode, status);
7469     return v;
7470 }
7471
7472 #define COMPARE(s, nan_exp)                                                  \
7473 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7474                                       int is_quiet, float_status *status)    \
7475 {                                                                            \
7476     flag aSign, bSign;                                                       \
7477     uint ## s ## _t av, bv;                                                  \
7478     a = float ## s ## _squash_input_denormal(a, status);                     \
7479     b = float ## s ## _squash_input_denormal(b, status);                     \
7480                                                                              \
7481     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7482          extractFloat ## s ## Frac( a ) ) ||                                 \
7483         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7484           extractFloat ## s ## Frac( b ) )) {                                \
7485         if (!is_quiet ||                                                     \
7486             float ## s ## _is_signaling_nan(a, status) ||                  \
7487             float ## s ## _is_signaling_nan(b, status)) {                 \
7488             float_raise(float_flag_invalid, status);                         \
7489         }                                                                    \
7490         return float_relation_unordered;                                     \
7491     }                                                                        \
7492     aSign = extractFloat ## s ## Sign( a );                                  \
7493     bSign = extractFloat ## s ## Sign( b );                                  \
7494     av = float ## s ## _val(a);                                              \
7495     bv = float ## s ## _val(b);                                              \
7496     if ( aSign != bSign ) {                                                  \
7497         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7498             /* zero case */                                                  \
7499             return float_relation_equal;                                     \
7500         } else {                                                             \
7501             return 1 - (2 * aSign);                                          \
7502         }                                                                    \
7503     } else {                                                                 \
7504         if (av == bv) {                                                      \
7505             return float_relation_equal;                                     \
7506         } else {                                                             \
7507             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7508         }                                                                    \
7509     }                                                                        \
7510 }                                                                            \
7511                                                                              \
7512 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7513 {                                                                            \
7514     return float ## s ## _compare_internal(a, b, 0, status);                 \
7515 }                                                                            \
7516                                                                              \
7517 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7518                                  float_status *status)                       \
7519 {                                                                            \
7520     return float ## s ## _compare_internal(a, b, 1, status);                 \
7521 }
7522
7523 COMPARE(32, 0xff)
7524 COMPARE(64, 0x7ff)
7525
7526 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7527                                             int is_quiet, float_status *status)
7528 {
7529     flag aSign, bSign;
7530
7531     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7532         float_raise(float_flag_invalid, status);
7533         return float_relation_unordered;
7534     }
7535     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7536           ( extractFloatx80Frac( a )<<1 ) ) ||
7537         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7538           ( extractFloatx80Frac( b )<<1 ) )) {
7539         if (!is_quiet ||
7540             floatx80_is_signaling_nan(a, status) ||
7541             floatx80_is_signaling_nan(b, status)) {
7542             float_raise(float_flag_invalid, status);
7543         }
7544         return float_relation_unordered;
7545     }
7546     aSign = extractFloatx80Sign( a );
7547     bSign = extractFloatx80Sign( b );
7548     if ( aSign != bSign ) {
7549
7550         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7551              ( ( a.low | b.low ) == 0 ) ) {
7552             /* zero case */
7553             return float_relation_equal;
7554         } else {
7555             return 1 - (2 * aSign);
7556         }
7557     } else {
7558         if (a.low == b.low && a.high == b.high) {
7559             return float_relation_equal;
7560         } else {
7561             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7562         }
7563     }
7564 }
7565
7566 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7567 {
7568     return floatx80_compare_internal(a, b, 0, status);
7569 }
7570
7571 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7572 {
7573     return floatx80_compare_internal(a, b, 1, status);
7574 }
7575
7576 static inline int float128_compare_internal(float128 a, float128 b,
7577                                             int is_quiet, float_status *status)
7578 {
7579     flag aSign, bSign;
7580
7581     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7582           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7583         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7584           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7585         if (!is_quiet ||
7586             float128_is_signaling_nan(a, status) ||
7587             float128_is_signaling_nan(b, status)) {
7588             float_raise(float_flag_invalid, status);
7589         }
7590         return float_relation_unordered;
7591     }
7592     aSign = extractFloat128Sign( a );
7593     bSign = extractFloat128Sign( b );
7594     if ( aSign != bSign ) {
7595         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7596             /* zero case */
7597             return float_relation_equal;
7598         } else {
7599             return 1 - (2 * aSign);
7600         }
7601     } else {
7602         if (a.low == b.low && a.high == b.high) {
7603             return float_relation_equal;
7604         } else {
7605             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7606         }
7607     }
7608 }
7609
7610 int float128_compare(float128 a, float128 b, float_status *status)
7611 {
7612     return float128_compare_internal(a, b, 0, status);
7613 }
7614
7615 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7616 {
7617     return float128_compare_internal(a, b, 1, status);
7618 }
7619
7620 /* min() and max() functions. These can't be implemented as
7621  * 'compare and pick one input' because that would mishandle
7622  * NaNs and +0 vs -0.
7623  *
7624  * minnum() and maxnum() functions. These are similar to the min()
7625  * and max() functions but if one of the arguments is a QNaN and
7626  * the other is numerical then the numerical argument is returned.
7627  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7628  * and maxNum() operations. min() and max() are the typical min/max
7629  * semantics provided by many CPUs which predate that specification.
7630  *
7631  * minnummag() and maxnummag() functions correspond to minNumMag()
7632  * and minNumMag() from the IEEE-754 2008.
7633  */
7634 #define MINMAX(s)                                                       \
7635 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7636                                                int ismin, int isieee,   \
7637                                                int ismag,               \
7638                                                float_status *status)    \
7639 {                                                                       \
7640     flag aSign, bSign;                                                  \
7641     uint ## s ## _t av, bv, aav, abv;                                   \
7642     a = float ## s ## _squash_input_denormal(a, status);                \
7643     b = float ## s ## _squash_input_denormal(b, status);                \
7644     if (float ## s ## _is_any_nan(a) ||                                 \
7645         float ## s ## _is_any_nan(b)) {                                 \
7646         if (isieee) {                                                   \
7647             if (float ## s ## _is_quiet_nan(a, status) &&               \
7648                 !float ## s ##_is_any_nan(b)) {                         \
7649                 return b;                                               \
7650             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7651                        !float ## s ## _is_any_nan(a)) {                \
7652                 return a;                                               \
7653             }                                                           \
7654         }                                                               \
7655         return propagateFloat ## s ## NaN(a, b, status);                \
7656     }                                                                   \
7657     aSign = extractFloat ## s ## Sign(a);                               \
7658     bSign = extractFloat ## s ## Sign(b);                               \
7659     av = float ## s ## _val(a);                                         \
7660     bv = float ## s ## _val(b);                                         \
7661     if (ismag) {                                                        \
7662         aav = float ## s ## _abs(av);                                   \
7663         abv = float ## s ## _abs(bv);                                   \
7664         if (aav != abv) {                                               \
7665             if (ismin) {                                                \
7666                 return (aav < abv) ? a : b;                             \
7667             } else {                                                    \
7668                 return (aav < abv) ? b : a;                             \
7669             }                                                           \
7670         }                                                               \
7671     }                                                                   \
7672     if (aSign != bSign) {                                               \
7673         if (ismin) {                                                    \
7674             return aSign ? a : b;                                       \
7675         } else {                                                        \
7676             return aSign ? b : a;                                       \
7677         }                                                               \
7678     } else {                                                            \
7679         if (ismin) {                                                    \
7680             return (aSign ^ (av < bv)) ? a : b;                         \
7681         } else {                                                        \
7682             return (aSign ^ (av < bv)) ? b : a;                         \
7683         }                                                               \
7684     }                                                                   \
7685 }                                                                       \
7686                                                                         \
7687 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7688                               float_status *status)                     \
7689 {                                                                       \
7690     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7691 }                                                                       \
7692                                                                         \
7693 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7694                               float_status *status)                     \
7695 {                                                                       \
7696     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7697 }                                                                       \
7698                                                                         \
7699 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7700                                  float_status *status)                  \
7701 {                                                                       \
7702     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7703 }                                                                       \
7704                                                                         \
7705 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7706                                  float_status *status)                  \
7707 {                                                                       \
7708     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7709 }                                                                       \
7710                                                                         \
7711 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7712                                     float_status *status)               \
7713 {                                                                       \
7714     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7715 }                                                                       \
7716                                                                         \
7717 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7718                                     float_status *status)               \
7719 {                                                                       \
7720     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7721 }
7722
7723 MINMAX(32)
7724 MINMAX(64)
7725
7726
7727 /* Multiply A by 2 raised to the power N.  */
7728 float32 float32_scalbn(float32 a, int n, float_status *status)
7729 {
7730     flag aSign;
7731     int16_t aExp;
7732     uint32_t aSig;
7733
7734     a = float32_squash_input_denormal(a, status);
7735     aSig = extractFloat32Frac( a );
7736     aExp = extractFloat32Exp( a );
7737     aSign = extractFloat32Sign( a );
7738
7739     if ( aExp == 0xFF ) {
7740         if ( aSig ) {
7741             return propagateFloat32NaN(a, a, status);
7742         }
7743         return a;
7744     }
7745     if (aExp != 0) {
7746         aSig |= 0x00800000;
7747     } else if (aSig == 0) {
7748         return a;
7749     } else {
7750         aExp++;
7751     }
7752
7753     if (n > 0x200) {
7754         n = 0x200;
7755     } else if (n < -0x200) {
7756         n = -0x200;
7757     }
7758
7759     aExp += n - 1;
7760     aSig <<= 7;
7761     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7762 }
7763
7764 float64 float64_scalbn(float64 a, int n, float_status *status)
7765 {
7766     flag aSign;
7767     int16_t aExp;
7768     uint64_t aSig;
7769
7770     a = float64_squash_input_denormal(a, status);
7771     aSig = extractFloat64Frac( a );
7772     aExp = extractFloat64Exp( a );
7773     aSign = extractFloat64Sign( a );
7774
7775     if ( aExp == 0x7FF ) {
7776         if ( aSig ) {
7777             return propagateFloat64NaN(a, a, status);
7778         }
7779         return a;
7780     }
7781     if (aExp != 0) {
7782         aSig |= LIT64( 0x0010000000000000 );
7783     } else if (aSig == 0) {
7784         return a;
7785     } else {
7786         aExp++;
7787     }
7788
7789     if (n > 0x1000) {
7790         n = 0x1000;
7791     } else if (n < -0x1000) {
7792         n = -0x1000;
7793     }
7794
7795     aExp += n - 1;
7796     aSig <<= 10;
7797     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7798 }
7799
7800 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7801 {
7802     flag aSign;
7803     int32_t aExp;
7804     uint64_t aSig;
7805
7806     if (floatx80_invalid_encoding(a)) {
7807         float_raise(float_flag_invalid, status);
7808         return floatx80_default_nan(status);
7809     }
7810     aSig = extractFloatx80Frac( a );
7811     aExp = extractFloatx80Exp( a );
7812     aSign = extractFloatx80Sign( a );
7813
7814     if ( aExp == 0x7FFF ) {
7815         if ( aSig<<1 ) {
7816             return propagateFloatx80NaN(a, a, status);
7817         }
7818         return a;
7819     }
7820
7821     if (aExp == 0) {
7822         if (aSig == 0) {
7823             return a;
7824         }
7825         aExp++;
7826     }
7827
7828     if (n > 0x10000) {
7829         n = 0x10000;
7830     } else if (n < -0x10000) {
7831         n = -0x10000;
7832     }
7833
7834     aExp += n;
7835     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7836                                          aSign, aExp, aSig, 0, status);
7837 }
7838
7839 float128 float128_scalbn(float128 a, int n, float_status *status)
7840 {
7841     flag aSign;
7842     int32_t aExp;
7843     uint64_t aSig0, aSig1;
7844
7845     aSig1 = extractFloat128Frac1( a );
7846     aSig0 = extractFloat128Frac0( a );
7847     aExp = extractFloat128Exp( a );
7848     aSign = extractFloat128Sign( a );
7849     if ( aExp == 0x7FFF ) {
7850         if ( aSig0 | aSig1 ) {
7851             return propagateFloat128NaN(a, a, status);
7852         }
7853         return a;
7854     }
7855     if (aExp != 0) {
7856         aSig0 |= LIT64( 0x0001000000000000 );
7857     } else if (aSig0 == 0 && aSig1 == 0) {
7858         return a;
7859     } else {
7860         aExp++;
7861     }
7862
7863     if (n > 0x10000) {
7864         n = 0x10000;
7865     } else if (n < -0x10000) {
7866         n = -0x10000;
7867     }
7868
7869     aExp += n - 1;
7870     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7871                                          , status);
7872
7873 }