fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 /* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
  34    be flushed to zero.  */
  35 #include "softfloat.h"
  36
  37 /*----------------------------------------------------------------------------
  38 | Primitive arithmetic functions, including multi-word arithmetic, and
  39 | division and square root approximations.  (Can be specialized to target if
  40 | desired.)
  41 *----------------------------------------------------------------------------*/
  42 #include "softfloat-macros.h"
  43
  44 /*----------------------------------------------------------------------------
  45 | Functions and definitions to determine:  (1) whether tininess for underflow
  46 | is detected before or after rounding by default, (2) what (if anything)
  47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  49 | are propagated from function inputs to output.  These details are target-
  50 | specific.
  51 *----------------------------------------------------------------------------*/
  52 #include "softfloat-specialize.h"
  53
  54 void set_float_rounding_mode(int val STATUS_PARAM)
  55 {
  56     STATUS(float_rounding_mode) = val;
  57 }
  58
  59 void set_float_exception_flags(int val STATUS_PARAM)
  60 {
  61     STATUS(float_exception_flags) = val;
  62 }
  63
  64 #ifdef FLOATX80
  65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  66 {
  67     STATUS(floatx80_rounding_precision) = val;
  68 }
  69 #endif
  70
  71 /*----------------------------------------------------------------------------
  72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  74 | input.  If `zSign' is 1, the input is negated before being converted to an
  75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  76 | is simply rounded to an integer, with the inexact exception raised if the
  77 | input cannot be represented exactly as an integer.  However, if the fixed-
  78 | point input is too large, the invalid exception is raised and the largest
  79 | positive or negative integer is returned.
  80 *----------------------------------------------------------------------------*/
  81
  82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  83 {
  84     int8 roundingMode;
  85     flag roundNearestEven;
  86     int8 roundIncrement, roundBits;
  87     int32 z;
  88
  89     roundingMode = STATUS(float_rounding_mode);
  90     roundNearestEven = ( roundingMode == float_round_nearest_even );
  91     roundIncrement = 0x40;
  92     if ( ! roundNearestEven ) {
  93         if ( roundingMode == float_round_to_zero ) {
  94             roundIncrement = 0;
  95         }
  96         else {
  97             roundIncrement = 0x7F;
  98             if ( zSign ) {
  99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 100             }
 101             else {
 102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 103             }
 104         }
 105     }
 106     roundBits = absZ & 0x7F;
 107     absZ = ( absZ + roundIncrement )>>7;
 108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 109     z = absZ;
 110     if ( zSign ) z = - z;
 111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 112         float_raise( float_flag_invalid STATUS_VAR);
 113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 114     }
 115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 116     return z;
 117
 118 }
 119
 120 /*----------------------------------------------------------------------------
 121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 123 | and returns the properly rounded 64-bit integer corresponding to the input.
 124 | If `zSign' is 1, the input is negated before being converted to an integer.
 125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 126 | the inexact exception raised if the input cannot be represented exactly as
 127 | an integer.  However, if the fixed-point input is too large, the invalid
 128 | exception is raised and the largest positive or negative integer is
 129 | returned.
 130 *----------------------------------------------------------------------------*/
 131
 132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 133 {
 134     int8 roundingMode;
 135     flag roundNearestEven, increment;
 136     int64 z;
 137
 138     roundingMode = STATUS(float_rounding_mode);
 139     roundNearestEven = ( roundingMode == float_round_nearest_even );
 140     increment = ( (sbits64) absZ1 < 0 );
 141     if ( ! roundNearestEven ) {
 142         if ( roundingMode == float_round_to_zero ) {
 143             increment = 0;
 144         }
 145         else {
 146             if ( zSign ) {
 147                 increment = ( roundingMode == float_round_down ) && absZ1;
 148             }
 149             else {
 150                 increment = ( roundingMode == float_round_up ) && absZ1;
 151             }
 152         }
 153     }
 154     if ( increment ) {
 155         ++absZ0;
 156         if ( absZ0 == 0 ) goto overflow;
 157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 158     }
 159     z = absZ0;
 160     if ( zSign ) z = - z;
 161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 162  overflow:
 163         float_raise( float_flag_invalid STATUS_VAR);
 164         return
 165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 166             : LIT64( 0x7FFFFFFFFFFFFFFF );
 167     }
 168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 169     return z;
 170
 171 }
 172
 173 /*----------------------------------------------------------------------------
 174 | Returns the fraction bits of the single-precision floating-point value `a'.
 175 *----------------------------------------------------------------------------*/
 176
 177 INLINE bits32 extractFloat32Frac( float32 a )
 178 {
 179
 180     return float32_val(a) & 0x007FFFFF;
 181
 182 }
 183
 184 /*----------------------------------------------------------------------------
 185 | Returns the exponent bits of the single-precision floating-point value `a'.
 186 *----------------------------------------------------------------------------*/
 187
 188 INLINE int16 extractFloat32Exp( float32 a )
 189 {
 190
 191     return ( float32_val(a)>>23 ) & 0xFF;
 192
 193 }
 194
 195 /*----------------------------------------------------------------------------
 196 | Returns the sign bit of the single-precision floating-point value `a'.
 197 *----------------------------------------------------------------------------*/
 198
 199 INLINE flag extractFloat32Sign( float32 a )
 200 {
 201
 202     return float32_val(a)>>31;
 203
 204 }
 205
 206 /*----------------------------------------------------------------------------
 207 | Normalizes the subnormal single-precision floating-point value represented
 208 | by the denormalized significand `aSig'.  The normalized exponent and
 209 | significand are stored at the locations pointed to by `zExpPtr' and
 210 | `zSigPtr', respectively.
 211 *----------------------------------------------------------------------------*/
 212
 213 static void
 214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 215 {
 216     int8 shiftCount;
 217
 218     shiftCount = countLeadingZeros32( aSig ) - 8;
 219     *zSigPtr = aSig<<shiftCount;
 220     *zExpPtr = 1 - shiftCount;
 221
 222 }
 223
 224 /*----------------------------------------------------------------------------
 225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 226 | single-precision floating-point value, returning the result.  After being
 227 | shifted into the proper positions, the three fields are simply added
 228 | together to form the result.  This means that any integer portion of `zSig'
 229 | will be added into the exponent.  Since a properly normalized significand
 230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 231 | than the desired result exponent whenever `zSig' is a complete, normalized
 232 | significand.
 233 *----------------------------------------------------------------------------*/
 234
 235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 236 {
 237
 238     return make_float32(
 239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 240
 241 }
 242
 243 /*----------------------------------------------------------------------------
 244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 245 | and significand `zSig', and returns the proper single-precision floating-
 246 | point value corresponding to the abstract input.  Ordinarily, the abstract
 247 | value is simply rounded and packed into the single-precision format, with
 248 | the inexact exception raised if the abstract input cannot be represented
 249 | exactly.  However, if the abstract value is too large, the overflow and
 250 | inexact exceptions are raised and an infinity or maximal finite value is
 251 | returned.  If the abstract value is too small, the input value is rounded to
 252 | a subnormal number, and the underflow and inexact exceptions are raised if
 253 | the abstract input cannot be represented exactly as a subnormal single-
 254 | precision floating-point number.
 255 |     The input significand `zSig' has its binary point between bits 30
 256 | and 29, which is 7 bits to the left of the usual location.  This shifted
 257 | significand must be normalized or smaller.  If `zSig' is not normalized,
 258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 259 | and it must not require rounding.  In the usual case that `zSig' is
 260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 262 | Binary Floating-Point Arithmetic.
 263 *----------------------------------------------------------------------------*/
 264
 265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 266 {
 267     int8 roundingMode;
 268     flag roundNearestEven;
 269     int8 roundIncrement, roundBits;
 270     flag isTiny;
 271
 272     roundingMode = STATUS(float_rounding_mode);
 273     roundNearestEven = ( roundingMode == float_round_nearest_even );
 274     roundIncrement = 0x40;
 275     if ( ! roundNearestEven ) {
 276         if ( roundingMode == float_round_to_zero ) {
 277             roundIncrement = 0;
 278         }
 279         else {
 280             roundIncrement = 0x7F;
 281             if ( zSign ) {
 282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 283             }
 284             else {
 285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 286             }
 287         }
 288     }
 289     roundBits = zSig & 0x7F;
 290     if ( 0xFD <= (bits16) zExp ) {
 291         if (    ( 0xFD < zExp )
 292              || (    ( zExp == 0xFD )
 293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 294            ) {
 295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 297         }
 298         if ( zExp < 0 ) {
 299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 300             isTiny =
 301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 302                 || ( zExp < -1 )
 303                 || ( zSig + roundIncrement < 0x80000000 );
 304             shift32RightJamming( zSig, - zExp, &zSig );
 305             zExp = 0;
 306             roundBits = zSig & 0x7F;
 307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 308         }
 309     }
 310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 311     zSig = ( zSig + roundIncrement )>>7;
 312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 313     if ( zSig == 0 ) zExp = 0;
 314     return packFloat32( zSign, zExp, zSig );
 315
 316 }
 317
 318 /*----------------------------------------------------------------------------
 319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 320 | and significand `zSig', and returns the proper single-precision floating-
 321 | point value corresponding to the abstract input.  This routine is just like
 322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 324 | floating-point exponent.
 325 *----------------------------------------------------------------------------*/
 326
 327 static float32
 328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 329 {
 330     int8 shiftCount;
 331
 332     shiftCount = countLeadingZeros32( zSig ) - 1;
 333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 334
 335 }
 336
 337 /*----------------------------------------------------------------------------
 338 | Returns the fraction bits of the double-precision floating-point value `a'.
 339 *----------------------------------------------------------------------------*/
 340
 341 INLINE bits64 extractFloat64Frac( float64 a )
 342 {
 343
 344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 345
 346 }
 347
 348 /*----------------------------------------------------------------------------
 349 | Returns the exponent bits of the double-precision floating-point value `a'.
 350 *----------------------------------------------------------------------------*/
 351
 352 INLINE int16 extractFloat64Exp( float64 a )
 353 {
 354
 355     return ( float64_val(a)>>52 ) & 0x7FF;
 356
 357 }
 358
 359 /*----------------------------------------------------------------------------
 360 | Returns the sign bit of the double-precision floating-point value `a'.
 361 *----------------------------------------------------------------------------*/
 362
 363 INLINE flag extractFloat64Sign( float64 a )
 364 {
 365
 366     return float64_val(a)>>63;
 367
 368 }
 369
 370 /*----------------------------------------------------------------------------
 371 | Normalizes the subnormal double-precision floating-point value represented
 372 | by the denormalized significand `aSig'.  The normalized exponent and
 373 | significand are stored at the locations pointed to by `zExpPtr' and
 374 | `zSigPtr', respectively.
 375 *----------------------------------------------------------------------------*/
 376
 377 static void
 378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 379 {
 380     int8 shiftCount;
 381
 382     shiftCount = countLeadingZeros64( aSig ) - 11;
 383     *zSigPtr = aSig<<shiftCount;
 384     *zExpPtr = 1 - shiftCount;
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 390 | double-precision floating-point value, returning the result.  After being
 391 | shifted into the proper positions, the three fields are simply added
 392 | together to form the result.  This means that any integer portion of `zSig'
 393 | will be added into the exponent.  Since a properly normalized significand
 394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 395 | than the desired result exponent whenever `zSig' is a complete, normalized
 396 | significand.
 397 *----------------------------------------------------------------------------*/
 398
 399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 400 {
 401
 402     return make_float64(
 403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 404
 405 }
 406
 407 /*----------------------------------------------------------------------------
 408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 409 | and significand `zSig', and returns the proper double-precision floating-
 410 | point value corresponding to the abstract input.  Ordinarily, the abstract
 411 | value is simply rounded and packed into the double-precision format, with
 412 | the inexact exception raised if the abstract input cannot be represented
 413 | exactly.  However, if the abstract value is too large, the overflow and
 414 | inexact exceptions are raised and an infinity or maximal finite value is
 415 | returned.  If the abstract value is too small, the input value is rounded
 416 | to a subnormal number, and the underflow and inexact exceptions are raised
 417 | if the abstract input cannot be represented exactly as a subnormal double-
 418 | precision floating-point number.
 419 |     The input significand `zSig' has its binary point between bits 62
 420 | and 61, which is 10 bits to the left of the usual location.  This shifted
 421 | significand must be normalized or smaller.  If `zSig' is not normalized,
 422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 423 | and it must not require rounding.  In the usual case that `zSig' is
 424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 426 | Binary Floating-Point Arithmetic.
 427 *----------------------------------------------------------------------------*/
 428
 429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 430 {
 431     int8 roundingMode;
 432     flag roundNearestEven;
 433     int16 roundIncrement, roundBits;
 434     flag isTiny;
 435
 436     roundingMode = STATUS(float_rounding_mode);
 437     roundNearestEven = ( roundingMode == float_round_nearest_even );
 438     roundIncrement = 0x200;
 439     if ( ! roundNearestEven ) {
 440         if ( roundingMode == float_round_to_zero ) {
 441             roundIncrement = 0;
 442         }
 443         else {
 444             roundIncrement = 0x3FF;
 445             if ( zSign ) {
 446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 447             }
 448             else {
 449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 450             }
 451         }
 452     }
 453     roundBits = zSig & 0x3FF;
 454     if ( 0x7FD <= (bits16) zExp ) {
 455         if (    ( 0x7FD < zExp )
 456              || (    ( zExp == 0x7FD )
 457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 458            ) {
 459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 461         }
 462         if ( zExp < 0 ) {
 463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 464             isTiny =
 465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 466                 || ( zExp < -1 )
 467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 468             shift64RightJamming( zSig, - zExp, &zSig );
 469             zExp = 0;
 470             roundBits = zSig & 0x3FF;
 471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 472         }
 473     }
 474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 475     zSig = ( zSig + roundIncrement )>>10;
 476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 477     if ( zSig == 0 ) zExp = 0;
 478     return packFloat64( zSign, zExp, zSig );
 479
 480 }
 481
 482 /*----------------------------------------------------------------------------
 483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 484 | and significand `zSig', and returns the proper double-precision floating-
 485 | point value corresponding to the abstract input.  This routine is just like
 486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 488 | floating-point exponent.
 489 *----------------------------------------------------------------------------*/
 490
 491 static float64
 492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 493 {
 494     int8 shiftCount;
 495
 496     shiftCount = countLeadingZeros64( zSig ) - 1;
 497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 498
 499 }
 500
 501 #ifdef FLOATX80
 502
 503 /*----------------------------------------------------------------------------
 504 | Returns the fraction bits of the extended double-precision floating-point
 505 | value `a'.
 506 *----------------------------------------------------------------------------*/
 507
 508 INLINE bits64 extractFloatx80Frac( floatx80 a )
 509 {
 510
 511     return a.low;
 512
 513 }
 514
 515 /*----------------------------------------------------------------------------
 516 | Returns the exponent bits of the extended double-precision floating-point
 517 | value `a'.
 518 *----------------------------------------------------------------------------*/
 519
 520 INLINE int32 extractFloatx80Exp( floatx80 a )
 521 {
 522
 523     return a.high & 0x7FFF;
 524
 525 }
 526
 527 /*----------------------------------------------------------------------------
 528 | Returns the sign bit of the extended double-precision floating-point value
 529 | `a'.
 530 *----------------------------------------------------------------------------*/
 531
 532 INLINE flag extractFloatx80Sign( floatx80 a )
 533 {
 534
 535     return a.high>>15;
 536
 537 }
 538
 539 /*----------------------------------------------------------------------------
 540 | Normalizes the subnormal extended double-precision floating-point value
 541 | represented by the denormalized significand `aSig'.  The normalized exponent
 542 | and significand are stored at the locations pointed to by `zExpPtr' and
 543 | `zSigPtr', respectively.
 544 *----------------------------------------------------------------------------*/
 545
 546 static void
 547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( aSig );
 552     *zSigPtr = aSig<<shiftCount;
 553     *zExpPtr = 1 - shiftCount;
 554
 555 }
 556
 557 /*----------------------------------------------------------------------------
 558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 559 | extended double-precision floating-point value, returning the result.
 560 *----------------------------------------------------------------------------*/
 561
 562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 563 {
 564     floatx80 z;
 565
 566     z.low = zSig;
 567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 568     return z;
 569
 570 }
 571
 572 /*----------------------------------------------------------------------------
 573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 575 | and returns the proper extended double-precision floating-point value
 576 | corresponding to the abstract input.  Ordinarily, the abstract value is
 577 | rounded and packed into the extended double-precision format, with the
 578 | inexact exception raised if the abstract input cannot be represented
 579 | exactly.  However, if the abstract value is too large, the overflow and
 580 | inexact exceptions are raised and an infinity or maximal finite value is
 581 | returned.  If the abstract value is too small, the input value is rounded to
 582 | a subnormal number, and the underflow and inexact exceptions are raised if
 583 | the abstract input cannot be represented exactly as a subnormal extended
 584 | double-precision floating-point number.
 585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 586 | number of bits as single or double precision, respectively.  Otherwise, the
 587 | result is rounded to the full precision of the extended double-precision
 588 | format.
 589 |     The input significand must be normalized or smaller.  If the input
 590 | significand is not normalized, `zExp' must be 0; in that case, the result
 591 | returned is a subnormal number, and it must not require rounding.  The
 592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 593 | Floating-Point Arithmetic.
 594 *----------------------------------------------------------------------------*/
 595
 596 static floatx80
 597  roundAndPackFloatx80(
 598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 599  STATUS_PARAM)
 600 {
 601     int8 roundingMode;
 602     flag roundNearestEven, increment, isTiny;
 603     int64 roundIncrement, roundMask, roundBits;
 604
 605     roundingMode = STATUS(float_rounding_mode);
 606     roundNearestEven = ( roundingMode == float_round_nearest_even );
 607     if ( roundingPrecision == 80 ) goto precision80;
 608     if ( roundingPrecision == 64 ) {
 609         roundIncrement = LIT64( 0x0000000000000400 );
 610         roundMask = LIT64( 0x00000000000007FF );
 611     }
 612     else if ( roundingPrecision == 32 ) {
 613         roundIncrement = LIT64( 0x0000008000000000 );
 614         roundMask = LIT64( 0x000000FFFFFFFFFF );
 615     }
 616     else {
 617         goto precision80;
 618     }
 619     zSig0 |= ( zSig1 != 0 );
 620     if ( ! roundNearestEven ) {
 621         if ( roundingMode == float_round_to_zero ) {
 622             roundIncrement = 0;
 623         }
 624         else {
 625             roundIncrement = roundMask;
 626             if ( zSign ) {
 627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 628             }
 629             else {
 630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 631             }
 632         }
 633     }
 634     roundBits = zSig0 & roundMask;
 635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 636         if (    ( 0x7FFE < zExp )
 637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 638            ) {
 639             goto overflow;
 640         }
 641         if ( zExp <= 0 ) {
 642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 643             isTiny =
 644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 645                 || ( zExp < 0 )
 646                 || ( zSig0 <= zSig0 + roundIncrement );
 647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 648             zExp = 0;
 649             roundBits = zSig0 & roundMask;
 650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 652             zSig0 += roundIncrement;
 653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 654             roundIncrement = roundMask + 1;
 655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 656                 roundMask |= roundIncrement;
 657             }
 658             zSig0 &= ~ roundMask;
 659             return packFloatx80( zSign, zExp, zSig0 );
 660         }
 661     }
 662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 663     zSig0 += roundIncrement;
 664     if ( zSig0 < roundIncrement ) {
 665         ++zExp;
 666         zSig0 = LIT64( 0x8000000000000000 );
 667     }
 668     roundIncrement = roundMask + 1;
 669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 670         roundMask |= roundIncrement;
 671     }
 672     zSig0 &= ~ roundMask;
 673     if ( zSig0 == 0 ) zExp = 0;
 674     return packFloatx80( zSign, zExp, zSig0 );
 675  precision80:
 676     increment = ( (sbits64) zSig1 < 0 );
 677     if ( ! roundNearestEven ) {
 678         if ( roundingMode == float_round_to_zero ) {
 679             increment = 0;
 680         }
 681         else {
 682             if ( zSign ) {
 683                 increment = ( roundingMode == float_round_down ) && zSig1;
 684             }
 685             else {
 686                 increment = ( roundingMode == float_round_up ) && zSig1;
 687             }
 688         }
 689     }
 690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691         if (    ( 0x7FFE < zExp )
 692              || (    ( zExp == 0x7FFE )
 693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 694                   && increment
 695                 )
 696            ) {
 697             roundMask = 0;
 698  overflow:
 699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 700             if (    ( roundingMode == float_round_to_zero )
 701                  || ( zSign && ( roundingMode == float_round_up ) )
 702                  || ( ! zSign && ( roundingMode == float_round_down ) )
 703                ) {
 704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 705             }
 706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 707         }
 708         if ( zExp <= 0 ) {
 709             isTiny =
 710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 711                 || ( zExp < 0 )
 712                 || ! increment
 713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 715             zExp = 0;
 716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 718             if ( roundNearestEven ) {
 719                 increment = ( (sbits64) zSig1 < 0 );
 720             }
 721             else {
 722                 if ( zSign ) {
 723                     increment = ( roundingMode == float_round_down ) && zSig1;
 724                 }
 725                 else {
 726                     increment = ( roundingMode == float_round_up ) && zSig1;
 727                 }
 728             }
 729             if ( increment ) {
 730                 ++zSig0;
 731                 zSig0 &=
 732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 734             }
 735             return packFloatx80( zSign, zExp, zSig0 );
 736         }
 737     }
 738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 739     if ( increment ) {
 740         ++zSig0;
 741         if ( zSig0 == 0 ) {
 742             ++zExp;
 743             zSig0 = LIT64( 0x8000000000000000 );
 744         }
 745         else {
 746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 747         }
 748     }
 749     else {
 750         if ( zSig0 == 0 ) zExp = 0;
 751     }
 752     return packFloatx80( zSign, zExp, zSig0 );
 753
 754 }
 755
 756 /*----------------------------------------------------------------------------
 757 | Takes an abstract floating-point value having sign `zSign', exponent
 758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 759 | and returns the proper extended double-precision floating-point value
 760 | corresponding to the abstract input.  This routine is just like
 761 | `roundAndPackFloatx80' except that the input significand does not have to be
 762 | normalized.
 763 *----------------------------------------------------------------------------*/
 764
 765 static floatx80
 766  normalizeRoundAndPackFloatx80(
 767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 768  STATUS_PARAM)
 769 {
 770     int8 shiftCount;
 771
 772     if ( zSig0 == 0 ) {
 773         zSig0 = zSig1;
 774         zSig1 = 0;
 775         zExp -= 64;
 776     }
 777     shiftCount = countLeadingZeros64( zSig0 );
 778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 779     zExp -= shiftCount;
 780     return
 781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 782
 783 }
 784
 785 #endif
 786
 787 #ifdef FLOAT128
 788
 789 /*----------------------------------------------------------------------------
 790 | Returns the least-significant 64 fraction bits of the quadruple-precision
 791 | floating-point value `a'.
 792 *----------------------------------------------------------------------------*/
 793
 794 INLINE bits64 extractFloat128Frac1( float128 a )
 795 {
 796
 797     return a.low;
 798
 799 }
 800
 801 /*----------------------------------------------------------------------------
 802 | Returns the most-significant 48 fraction bits of the quadruple-precision
 803 | floating-point value `a'.
 804 *----------------------------------------------------------------------------*/
 805
 806 INLINE bits64 extractFloat128Frac0( float128 a )
 807 {
 808
 809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 810
 811 }
 812
 813 /*----------------------------------------------------------------------------
 814 | Returns the exponent bits of the quadruple-precision floating-point value
 815 | `a'.
 816 *----------------------------------------------------------------------------*/
 817
 818 INLINE int32 extractFloat128Exp( float128 a )
 819 {
 820
 821     return ( a.high>>48 ) & 0x7FFF;
 822
 823 }
 824
 825 /*----------------------------------------------------------------------------
 826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 827 *----------------------------------------------------------------------------*/
 828
 829 INLINE flag extractFloat128Sign( float128 a )
 830 {
 831
 832     return a.high>>63;
 833
 834 }
 835
 836 /*----------------------------------------------------------------------------
 837 | Normalizes the subnormal quadruple-precision floating-point value
 838 | represented by the denormalized significand formed by the concatenation of
 839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 841 | significand are stored at the location pointed to by `zSig0Ptr', and the
 842 | least significant 64 bits of the normalized significand are stored at the
 843 | location pointed to by `zSig1Ptr'.
 844 *----------------------------------------------------------------------------*/
 845
 846 static void
 847  normalizeFloat128Subnormal(
 848      bits64 aSig0,
 849      bits64 aSig1,
 850      int32 *zExpPtr,
 851      bits64 *zSig0Ptr,
 852      bits64 *zSig1Ptr
 853  )
 854 {
 855     int8 shiftCount;
 856
 857     if ( aSig0 == 0 ) {
 858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 859         if ( shiftCount < 0 ) {
 860             *zSig0Ptr = aSig1>>( - shiftCount );
 861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 862         }
 863         else {
 864             *zSig0Ptr = aSig1<<shiftCount;
 865             *zSig1Ptr = 0;
 866         }
 867         *zExpPtr = - shiftCount - 63;
 868     }
 869     else {
 870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 872         *zExpPtr = 1 - shiftCount;
 873     }
 874
 875 }
 876
 877 /*----------------------------------------------------------------------------
 878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 880 | floating-point value, returning the result.  After being shifted into the
 881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 882 | added together to form the most significant 32 bits of the result.  This
 883 | means that any integer portion of `zSig0' will be added into the exponent.
 884 | Since a properly normalized significand will have an integer portion equal
 885 | to 1, the `zExp' input should be 1 less than the desired result exponent
 886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 887 | significand.
 888 *----------------------------------------------------------------------------*/
 889
 890 INLINE float128
 891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 892 {
 893     float128 z;
 894
 895     z.low = zSig1;
 896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 897     return z;
 898
 899 }
 900
 901 /*----------------------------------------------------------------------------
 902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 904 | and `zSig2', and returns the proper quadruple-precision floating-point value
 905 | corresponding to the abstract input.  Ordinarily, the abstract value is
 906 | simply rounded and packed into the quadruple-precision format, with the
 907 | inexact exception raised if the abstract input cannot be represented
 908 | exactly.  However, if the abstract value is too large, the overflow and
 909 | inexact exceptions are raised and an infinity or maximal finite value is
 910 | returned.  If the abstract value is too small, the input value is rounded to
 911 | a subnormal number, and the underflow and inexact exceptions are raised if
 912 | the abstract input cannot be represented exactly as a subnormal quadruple-
 913 | precision floating-point number.
 914 |     The input significand must be normalized or smaller.  If the input
 915 | significand is not normalized, `zExp' must be 0; in that case, the result
 916 | returned is a subnormal number, and it must not require rounding.  In the
 917 | usual case that the input significand is normalized, `zExp' must be 1 less
 918 | than the ``true'' floating-point exponent.  The handling of underflow and
 919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 920 *----------------------------------------------------------------------------*/
 921
 922 static float128
 923  roundAndPackFloat128(
 924      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 925 {
 926     int8 roundingMode;
 927     flag roundNearestEven, increment, isTiny;
 928
 929     roundingMode = STATUS(float_rounding_mode);
 930     roundNearestEven = ( roundingMode == float_round_nearest_even );
 931     increment = ( (sbits64) zSig2 < 0 );
 932     if ( ! roundNearestEven ) {
 933         if ( roundingMode == float_round_to_zero ) {
 934             increment = 0;
 935         }
 936         else {
 937             if ( zSign ) {
 938                 increment = ( roundingMode == float_round_down ) && zSig2;
 939             }
 940             else {
 941                 increment = ( roundingMode == float_round_up ) && zSig2;
 942             }
 943         }
 944     }
 945     if ( 0x7FFD <= (bits32) zExp ) {
 946         if (    ( 0x7FFD < zExp )
 947              || (    ( zExp == 0x7FFD )
 948                   && eq128(
 949                          LIT64( 0x0001FFFFFFFFFFFF ),
 950                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 951                          zSig0,
 952                          zSig1
 953                      )
 954                   && increment
 955                 )
 956            ) {
 957             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 958             if (    ( roundingMode == float_round_to_zero )
 959                  || ( zSign && ( roundingMode == float_round_up ) )
 960                  || ( ! zSign && ( roundingMode == float_round_down ) )
 961                ) {
 962                 return
 963                     packFloat128(
 964                         zSign,
 965                         0x7FFE,
 966                         LIT64( 0x0000FFFFFFFFFFFF ),
 967                         LIT64( 0xFFFFFFFFFFFFFFFF )
 968                     );
 969             }
 970             return packFloat128( zSign, 0x7FFF, 0, 0 );
 971         }
 972         if ( zExp < 0 ) {
 973             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
 974             isTiny =
 975                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 976                 || ( zExp < -1 )
 977                 || ! increment
 978                 || lt128(
 979                        zSig0,
 980                        zSig1,
 981                        LIT64( 0x0001FFFFFFFFFFFF ),
 982                        LIT64( 0xFFFFFFFFFFFFFFFF )
 983                    );
 984             shift128ExtraRightJamming(
 985                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 986             zExp = 0;
 987             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
 988             if ( roundNearestEven ) {
 989                 increment = ( (sbits64) zSig2 < 0 );
 990             }
 991             else {
 992                 if ( zSign ) {
 993                     increment = ( roundingMode == float_round_down ) && zSig2;
 994                 }
 995                 else {
 996                     increment = ( roundingMode == float_round_up ) && zSig2;
 997                 }
 998             }
 999         }
1000     }
1001     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002     if ( increment ) {
1003         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005     }
1006     else {
1007         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1008     }
1009     return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011 }
1012
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1022
1023 static float128
1024  normalizeRoundAndPackFloat128(
1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026 {
1027     int8 shiftCount;
1028     bits64 zSig2;
1029
1030     if ( zSig0 == 0 ) {
1031         zSig0 = zSig1;
1032         zSig1 = 0;
1033         zExp -= 64;
1034     }
1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036     if ( 0 <= shiftCount ) {
1037         zSig2 = 0;
1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039     }
1040     else {
1041         shift128ExtraRightJamming(
1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043     }
1044     zExp -= shiftCount;
1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047 }
1048
1049 #endif
1050
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format.  The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1056
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1058 {
1059     flag zSign;
1060
1061     if ( a == 0 ) return float32_zero;
1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063     zSign = ( a < 0 );
1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format.  The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1073
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1075 {
1076     flag zSign;
1077     uint32 absA;
1078     int8 shiftCount;
1079     bits64 zSig;
1080
1081     if ( a == 0 ) return float64_zero;
1082     zSign = ( a < 0 );
1083     absA = zSign ? - a : a;
1084     shiftCount = countLeadingZeros32( absA ) + 21;
1085     zSig = absA;
1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088 }
1089
1090 #ifdef FLOATX80
1091
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format.  The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100 {
1101     flag zSign;
1102     uint32 absA;
1103     int8 shiftCount;
1104     bits64 zSig;
1105
1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107     zSign = ( a < 0 );
1108     absA = zSign ? - a : a;
1109     shiftCount = countLeadingZeros32( absA ) + 32;
1110     zSig = absA;
1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113 }
1114
1115 #endif
1116
1117 #ifdef FLOAT128
1118
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format.  The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1124
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1126 {
1127     flag zSign;
1128     uint32 absA;
1129     int8 shiftCount;
1130     bits64 zSig0;
1131
1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133     zSign = ( a < 0 );
1134     absA = zSign ? - a : a;
1135     shiftCount = countLeadingZeros32( absA ) + 17;
1136     zSig0 = absA;
1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139 }
1140
1141 #endif
1142
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format.  The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1148
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1150 {
1151     flag zSign;
1152     uint64 absA;
1153     int8 shiftCount;
1154
1155     if ( a == 0 ) return float32_zero;
1156     zSign = ( a < 0 );
1157     absA = zSign ? - a : a;
1158     shiftCount = countLeadingZeros64( absA ) - 40;
1159     if ( 0 <= shiftCount ) {
1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161     }
1162     else {
1163         shiftCount += 7;
1164         if ( shiftCount < 0 ) {
1165             shift64RightJamming( absA, - shiftCount, &absA );
1166         }
1167         else {
1168             absA <<= shiftCount;
1169         }
1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171     }
1172
1173 }
1174
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176 {
1177     int8 shiftCount;
1178
1179     if ( a == 0 ) return float32_zero;
1180     shiftCount = countLeadingZeros64( a ) - 40;
1181     if ( 0 <= shiftCount ) {
1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183     }
1184     else {
1185         shiftCount += 7;
1186         if ( shiftCount < 0 ) {
1187             shift64RightJamming( a, - shiftCount, &a );
1188         }
1189         else {
1190             a <<= shiftCount;
1191         }
1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193     }
1194 }
1195
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format.  The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1201
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1203 {
1204     flag zSign;
1205
1206     if ( a == 0 ) return float64_zero;
1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208         return packFloat64( 1, 0x43E, 0 );
1209     }
1210     zSign = ( a < 0 );
1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213 }
1214
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216 {
1217     if ( a == 0 ) return float64_zero;
1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220 }
1221
1222 #ifdef FLOATX80
1223
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format.  The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1230
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232 {
1233     flag zSign;
1234     uint64 absA;
1235     int8 shiftCount;
1236
1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238     zSign = ( a < 0 );
1239     absA = zSign ? - a : a;
1240     shiftCount = countLeadingZeros64( absA );
1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243 }
1244
1245 #endif
1246
1247 #ifdef FLOAT128
1248
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format.  The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1254
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1256 {
1257     flag zSign;
1258     uint64 absA;
1259     int8 shiftCount;
1260     int32 zExp;
1261     bits64 zSig0, zSig1;
1262
1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264     zSign = ( a < 0 );
1265     absA = zSign ? - a : a;
1266     shiftCount = countLeadingZeros64( absA ) + 49;
1267     zExp = 0x406E - shiftCount;
1268     if ( 64 <= shiftCount ) {
1269         zSig1 = 0;
1270         zSig0 = absA;
1271         shiftCount -= 64;
1272     }
1273     else {
1274         zSig1 = absA;
1275         zSig0 = 0;
1276     }
1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280 }
1281
1282 #endif
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format.  The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode.  If `a' is a NaN, the largest
1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1293
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1295 {
1296     flag aSign;
1297     int16 aExp, shiftCount;
1298     bits32 aSig;
1299     bits64 aSig64;
1300
1301     aSig = extractFloat32Frac( a );
1302     aExp = extractFloat32Exp( a );
1303     aSign = extractFloat32Sign( a );
1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305     if ( aExp ) aSig |= 0x00800000;
1306     shiftCount = 0xAF - aExp;
1307     aSig64 = aSig;
1308     aSig64 <<= 32;
1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312 }
1313
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format.  The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1323
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325 {
1326     flag aSign;
1327     int16 aExp, shiftCount;
1328     bits32 aSig;
1329     int32 z;
1330
1331     aSig = extractFloat32Frac( a );
1332     aExp = extractFloat32Exp( a );
1333     aSign = extractFloat32Sign( a );
1334     shiftCount = aExp - 0x9E;
1335     if ( 0 <= shiftCount ) {
1336         if ( float32_val(a) != 0xCF000000 ) {
1337             float_raise( float_flag_invalid STATUS_VAR);
1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339         }
1340         return (sbits32) 0x80000000;
1341     }
1342     else if ( aExp <= 0x7E ) {
1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344         return 0;
1345     }
1346     aSig = ( aSig | 0x00800000 )<<8;
1347     z = aSig>>( - shiftCount );
1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349         STATUS(float_exception_flags) |= float_flag_inexact;
1350     }
1351     if ( aSign ) z = - z;
1352     return z;
1353
1354 }
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode.  If `a' is a NaN, the largest
1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int64 float32_to_int64( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     bits32 aSig;
1371     bits64 aSig64, aSigExtra;
1372
1373     aSig = extractFloat32Frac( a );
1374     aExp = extractFloat32Exp( a );
1375     aSign = extractFloat32Sign( a );
1376     shiftCount = 0xBE - aExp;
1377     if ( shiftCount < 0 ) {
1378         float_raise( float_flag_invalid STATUS_VAR);
1379         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380             return LIT64( 0x7FFFFFFFFFFFFFFF );
1381         }
1382         return (sbits64) LIT64( 0x8000000000000000 );
1383     }
1384     if ( aExp ) aSig |= 0x00800000;
1385     aSig64 = aSig;
1386     aSig64 <<= 40;
1387     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390 }
1391
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format.  The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero.  If
1397 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1399 | returned.
1400 *----------------------------------------------------------------------------*/
1401
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403 {
1404     flag aSign;
1405     int16 aExp, shiftCount;
1406     bits32 aSig;
1407     bits64 aSig64;
1408     int64 z;
1409
1410     aSig = extractFloat32Frac( a );
1411     aExp = extractFloat32Exp( a );
1412     aSign = extractFloat32Sign( a );
1413     shiftCount = aExp - 0xBE;
1414     if ( 0 <= shiftCount ) {
1415         if ( float32_val(a) != 0xDF000000 ) {
1416             float_raise( float_flag_invalid STATUS_VAR);
1417             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1419             }
1420         }
1421         return (sbits64) LIT64( 0x8000000000000000 );
1422     }
1423     else if ( aExp <= 0x7E ) {
1424         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425         return 0;
1426     }
1427     aSig64 = aSig | 0x00800000;
1428     aSig64 <<= 40;
1429     z = aSig64>>( - shiftCount );
1430     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431         STATUS(float_exception_flags) |= float_flag_inexact;
1432     }
1433     if ( aSign ) z = - z;
1434     return z;
1435
1436 }
1437
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format.  The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1442 | Arithmetic.
1443 *----------------------------------------------------------------------------*/
1444
1445 float64 float32_to_float64( float32 a STATUS_PARAM )
1446 {
1447     flag aSign;
1448     int16 aExp;
1449     bits32 aSig;
1450
1451     aSig = extractFloat32Frac( a );
1452     aExp = extractFloat32Exp( a );
1453     aSign = extractFloat32Sign( a );
1454     if ( aExp == 0xFF ) {
1455         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456         return packFloat64( aSign, 0x7FF, 0 );
1457     }
1458     if ( aExp == 0 ) {
1459         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461         --aExp;
1462     }
1463     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465 }
1466
1467 #ifdef FLOATX80
1468
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format.  The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 | Arithmetic.
1474 *----------------------------------------------------------------------------*/
1475
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477 {
1478     flag aSign;
1479     int16 aExp;
1480     bits32 aSig;
1481
1482     aSig = extractFloat32Frac( a );
1483     aExp = extractFloat32Exp( a );
1484     aSign = extractFloat32Sign( a );
1485     if ( aExp == 0xFF ) {
1486         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488     }
1489     if ( aExp == 0 ) {
1490         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492     }
1493     aSig |= 0x00800000;
1494     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496 }
1497
1498 #endif
1499
1500 #ifdef FLOAT128
1501
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format.  The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic.
1507 *----------------------------------------------------------------------------*/
1508
1509 float128 float32_to_float128( float32 a STATUS_PARAM )
1510 {
1511     flag aSign;
1512     int16 aExp;
1513     bits32 aSig;
1514
1515     aSig = extractFloat32Frac( a );
1516     aExp = extractFloat32Exp( a );
1517     aSign = extractFloat32Sign( a );
1518     if ( aExp == 0xFF ) {
1519         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520         return packFloat128( aSign, 0x7FFF, 0, 0 );
1521     }
1522     if ( aExp == 0 ) {
1523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525         --aExp;
1526     }
1527     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529 }
1530
1531 #endif
1532
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value.  The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1539
1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
1541 {
1542     flag aSign;
1543     int16 aExp;
1544     bits32 lastBitMask, roundBitsMask;
1545     int8 roundingMode;
1546     bits32 z;
1547
1548     aExp = extractFloat32Exp( a );
1549     if ( 0x96 <= aExp ) {
1550         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551             return propagateFloat32NaN( a, a STATUS_VAR );
1552         }
1553         return a;
1554     }
1555     if ( aExp <= 0x7E ) {
1556         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557         STATUS(float_exception_flags) |= float_flag_inexact;
1558         aSign = extractFloat32Sign( a );
1559         switch ( STATUS(float_rounding_mode) ) {
1560          case float_round_nearest_even:
1561             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562                 return packFloat32( aSign, 0x7F, 0 );
1563             }
1564             break;
1565          case float_round_down:
1566             return make_float32(aSign ? 0xBF800000 : 0);
1567          case float_round_up:
1568             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569         }
1570         return packFloat32( aSign, 0, 0 );
1571     }
1572     lastBitMask = 1;
1573     lastBitMask <<= 0x96 - aExp;
1574     roundBitsMask = lastBitMask - 1;
1575     z = float32_val(a);
1576     roundingMode = STATUS(float_rounding_mode);
1577     if ( roundingMode == float_round_nearest_even ) {
1578         z += lastBitMask>>1;
1579         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580     }
1581     else if ( roundingMode != float_round_to_zero ) {
1582         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583             z += roundBitsMask;
1584         }
1585     }
1586     z &= ~ roundBitsMask;
1587     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588     return make_float32(z);
1589
1590 }
1591
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1595 | before being returned.  `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1599
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601 {
1602     int16 aExp, bExp, zExp;
1603     bits32 aSig, bSig, zSig;
1604     int16 expDiff;
1605
1606     aSig = extractFloat32Frac( a );
1607     aExp = extractFloat32Exp( a );
1608     bSig = extractFloat32Frac( b );
1609     bExp = extractFloat32Exp( b );
1610     expDiff = aExp - bExp;
1611     aSig <<= 6;
1612     bSig <<= 6;
1613     if ( 0 < expDiff ) {
1614         if ( aExp == 0xFF ) {
1615             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616             return a;
1617         }
1618         if ( bExp == 0 ) {
1619             --expDiff;
1620         }
1621         else {
1622             bSig |= 0x20000000;
1623         }
1624         shift32RightJamming( bSig, expDiff, &bSig );
1625         zExp = aExp;
1626     }
1627     else if ( expDiff < 0 ) {
1628         if ( bExp == 0xFF ) {
1629             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630             return packFloat32( zSign, 0xFF, 0 );
1631         }
1632         if ( aExp == 0 ) {
1633             ++expDiff;
1634         }
1635         else {
1636             aSig |= 0x20000000;
1637         }
1638         shift32RightJamming( aSig, - expDiff, &aSig );
1639         zExp = bExp;
1640     }
1641     else {
1642         if ( aExp == 0xFF ) {
1643             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644             return a;
1645         }
1646         if ( aExp == 0 ) {
1647             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649         }
1650         zSig = 0x40000000 + aSig + bSig;
1651         zExp = aExp;
1652         goto roundAndPack;
1653     }
1654     aSig |= 0x20000000;
1655     zSig = ( aSig + bSig )<<1;
1656     --zExp;
1657     if ( (sbits32) zSig < 0 ) {
1658         zSig = aSig + bSig;
1659         ++zExp;
1660     }
1661  roundAndPack:
1662     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664 }
1665
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1669 | difference is negated before being returned.  `zSign' is ignored if the
1670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1673
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675 {
1676     int16 aExp, bExp, zExp;
1677     bits32 aSig, bSig, zSig;
1678     int16 expDiff;
1679
1680     aSig = extractFloat32Frac( a );
1681     aExp = extractFloat32Exp( a );
1682     bSig = extractFloat32Frac( b );
1683     bExp = extractFloat32Exp( b );
1684     expDiff = aExp - bExp;
1685     aSig <<= 7;
1686     bSig <<= 7;
1687     if ( 0 < expDiff ) goto aExpBigger;
1688     if ( expDiff < 0 ) goto bExpBigger;
1689     if ( aExp == 0xFF ) {
1690         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691         float_raise( float_flag_invalid STATUS_VAR);
1692         return float32_default_nan;
1693     }
1694     if ( aExp == 0 ) {
1695         aExp = 1;
1696         bExp = 1;
1697     }
1698     if ( bSig < aSig ) goto aBigger;
1699     if ( aSig < bSig ) goto bBigger;
1700     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701  bExpBigger:
1702     if ( bExp == 0xFF ) {
1703         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704         return packFloat32( zSign ^ 1, 0xFF, 0 );
1705     }
1706     if ( aExp == 0 ) {
1707         ++expDiff;
1708     }
1709     else {
1710         aSig |= 0x40000000;
1711     }
1712     shift32RightJamming( aSig, - expDiff, &aSig );
1713     bSig |= 0x40000000;
1714  bBigger:
1715     zSig = bSig - aSig;
1716     zExp = bExp;
1717     zSign ^= 1;
1718     goto normalizeRoundAndPack;
1719  aExpBigger:
1720     if ( aExp == 0xFF ) {
1721         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722         return a;
1723     }
1724     if ( bExp == 0 ) {
1725         --expDiff;
1726     }
1727     else {
1728         bSig |= 0x40000000;
1729     }
1730     shift32RightJamming( bSig, expDiff, &bSig );
1731     aSig |= 0x40000000;
1732  aBigger:
1733     zSig = aSig - bSig;
1734     zExp = aExp;
1735  normalizeRoundAndPack:
1736     --zExp;
1737     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739 }
1740
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1746
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748 {
1749     flag aSign, bSign;
1750
1751     aSign = extractFloat32Sign( a );
1752     bSign = extractFloat32Sign( b );
1753     if ( aSign == bSign ) {
1754         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755     }
1756     else {
1757         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758     }
1759
1760 }
1761
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1767
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769 {
1770     flag aSign, bSign;
1771
1772     aSign = extractFloat32Sign( a );
1773     bSign = extractFloat32Sign( b );
1774     if ( aSign == bSign ) {
1775         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776     }
1777     else {
1778         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779     }
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1788
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790 {
1791     flag aSign, bSign, zSign;
1792     int16 aExp, bExp, zExp;
1793     bits32 aSig, bSig;
1794     bits64 zSig64;
1795     bits32 zSig;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     aSign = extractFloat32Sign( a );
1800     bSig = extractFloat32Frac( b );
1801     bExp = extractFloat32Exp( b );
1802     bSign = extractFloat32Sign( b );
1803     zSign = aSign ^ bSign;
1804     if ( aExp == 0xFF ) {
1805         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806             return propagateFloat32NaN( a, b STATUS_VAR );
1807         }
1808         if ( ( bExp | bSig ) == 0 ) {
1809             float_raise( float_flag_invalid STATUS_VAR);
1810             return float32_default_nan;
1811         }
1812         return packFloat32( zSign, 0xFF, 0 );
1813     }
1814     if ( bExp == 0xFF ) {
1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816         if ( ( aExp | aSig ) == 0 ) {
1817             float_raise( float_flag_invalid STATUS_VAR);
1818             return float32_default_nan;
1819         }
1820         return packFloat32( zSign, 0xFF, 0 );
1821     }
1822     if ( aExp == 0 ) {
1823         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825     }
1826     if ( bExp == 0 ) {
1827         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829     }
1830     zExp = aExp + bExp - 0x7F;
1831     aSig = ( aSig | 0x00800000 )<<7;
1832     bSig = ( bSig | 0x00800000 )<<8;
1833     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834     zSig = zSig64;
1835     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836         zSig <<= 1;
1837         --zExp;
1838     }
1839     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841 }
1842
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'.  The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1848
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850 {
1851     flag aSign, bSign, zSign;
1852     int16 aExp, bExp, zExp;
1853     bits32 aSig, bSig, zSig;
1854
1855     aSig = extractFloat32Frac( a );
1856     aExp = extractFloat32Exp( a );
1857     aSign = extractFloat32Sign( a );
1858     bSig = extractFloat32Frac( b );
1859     bExp = extractFloat32Exp( b );
1860     bSign = extractFloat32Sign( b );
1861     zSign = aSign ^ bSign;
1862     if ( aExp == 0xFF ) {
1863         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864         if ( bExp == 0xFF ) {
1865             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866             float_raise( float_flag_invalid STATUS_VAR);
1867             return float32_default_nan;
1868         }
1869         return packFloat32( zSign, 0xFF, 0 );
1870     }
1871     if ( bExp == 0xFF ) {
1872         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873         return packFloat32( zSign, 0, 0 );
1874     }
1875     if ( bExp == 0 ) {
1876         if ( bSig == 0 ) {
1877             if ( ( aExp | aSig ) == 0 ) {
1878                 float_raise( float_flag_invalid STATUS_VAR);
1879                 return float32_default_nan;
1880             }
1881             float_raise( float_flag_divbyzero STATUS_VAR);
1882             return packFloat32( zSign, 0xFF, 0 );
1883         }
1884         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885     }
1886     if ( aExp == 0 ) {
1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889     }
1890     zExp = aExp - bExp + 0x7D;
1891     aSig = ( aSig | 0x00800000 )<<7;
1892     bSig = ( bSig | 0x00800000 )<<8;
1893     if ( bSig <= ( aSig + aSig ) ) {
1894         aSig >>= 1;
1895         ++zExp;
1896     }
1897     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898     if ( ( zSig & 0x3F ) == 0 ) {
1899         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900     }
1901     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903 }
1904
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'.  The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1910
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912 {
1913     flag aSign, zSign;
1914     int16 aExp, bExp, expDiff;
1915     bits32 aSig, bSig;
1916     bits32 q;
1917     bits64 aSig64, bSig64, q64;
1918     bits32 alternateASig;
1919     sbits32 sigMean;
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     if ( aExp == 0xFF ) {
1927         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1928             return propagateFloat32NaN( a, b STATUS_VAR );
1929         }
1930         float_raise( float_flag_invalid STATUS_VAR);
1931         return float32_default_nan;
1932     }
1933     if ( bExp == 0xFF ) {
1934         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935         return a;
1936     }
1937     if ( bExp == 0 ) {
1938         if ( bSig == 0 ) {
1939             float_raise( float_flag_invalid STATUS_VAR);
1940             return float32_default_nan;
1941         }
1942         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943     }
1944     if ( aExp == 0 ) {
1945         if ( aSig == 0 ) return a;
1946         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1947     }
1948     expDiff = aExp - bExp;
1949     aSig |= 0x00800000;
1950     bSig |= 0x00800000;
1951     if ( expDiff < 32 ) {
1952         aSig <<= 8;
1953         bSig <<= 8;
1954         if ( expDiff < 0 ) {
1955             if ( expDiff < -1 ) return a;
1956             aSig >>= 1;
1957         }
1958         q = ( bSig <= aSig );
1959         if ( q ) aSig -= bSig;
1960         if ( 0 < expDiff ) {
1961             q = ( ( (bits64) aSig )<<32 ) / bSig;
1962             q >>= 32 - expDiff;
1963             bSig >>= 2;
1964             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1965         }
1966         else {
1967             aSig >>= 2;
1968             bSig >>= 2;
1969         }
1970     }
1971     else {
1972         if ( bSig <= aSig ) aSig -= bSig;
1973         aSig64 = ( (bits64) aSig )<<40;
1974         bSig64 = ( (bits64) bSig )<<40;
1975         expDiff -= 64;
1976         while ( 0 < expDiff ) {
1977             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1978             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1979             aSig64 = - ( ( bSig * q64 )<<38 );
1980             expDiff -= 62;
1981         }
1982         expDiff += 64;
1983         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1984         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1985         q = q64>>( 64 - expDiff );
1986         bSig <<= 6;
1987         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1988     }
1989     do {
1990         alternateASig = aSig;
1991         ++q;
1992         aSig -= bSig;
1993     } while ( 0 <= (sbits32) aSig );
1994     sigMean = aSig + alternateASig;
1995     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1996         aSig = alternateASig;
1997     }
1998     zSign = ( (sbits32) aSig < 0 );
1999     if ( zSign ) aSig = - aSig;
2000     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2001
2002 }
2003
2004 /*----------------------------------------------------------------------------
2005 | Returns the square root of the single-precision floating-point value `a'.
2006 | The operation is performed according to the IEC/IEEE Standard for Binary
2007 | Floating-Point Arithmetic.
2008 *----------------------------------------------------------------------------*/
2009
2010 float32 float32_sqrt( float32 a STATUS_PARAM )
2011 {
2012     flag aSign;
2013     int16 aExp, zExp;
2014     bits32 aSig, zSig;
2015     bits64 rem, term;
2016
2017     aSig = extractFloat32Frac( a );
2018     aExp = extractFloat32Exp( a );
2019     aSign = extractFloat32Sign( a );
2020     if ( aExp == 0xFF ) {
2021         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2022         if ( ! aSign ) return a;
2023         float_raise( float_flag_invalid STATUS_VAR);
2024         return float32_default_nan;
2025     }
2026     if ( aSign ) {
2027         if ( ( aExp | aSig ) == 0 ) return a;
2028         float_raise( float_flag_invalid STATUS_VAR);
2029         return float32_default_nan;
2030     }
2031     if ( aExp == 0 ) {
2032         if ( aSig == 0 ) return float32_zero;
2033         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2034     }
2035     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2036     aSig = ( aSig | 0x00800000 )<<8;
2037     zSig = estimateSqrt32( aExp, aSig ) + 2;
2038     if ( ( zSig & 0x7F ) <= 5 ) {
2039         if ( zSig < 2 ) {
2040             zSig = 0x7FFFFFFF;
2041             goto roundAndPack;
2042         }
2043         aSig >>= aExp & 1;
2044         term = ( (bits64) zSig ) * zSig;
2045         rem = ( ( (bits64) aSig )<<32 ) - term;
2046         while ( (sbits64) rem < 0 ) {
2047             --zSig;
2048             rem += ( ( (bits64) zSig )<<1 ) | 1;
2049         }
2050         zSig |= ( rem != 0 );
2051     }
2052     shift32RightJamming( zSig, 1, &zSig );
2053  roundAndPack:
2054     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2055
2056 }
2057
2058 /*----------------------------------------------------------------------------
2059 | Returns the binary log of the single-precision floating-point value `a'.
2060 | The operation is performed according to the IEC/IEEE Standard for Binary
2061 | Floating-Point Arithmetic.
2062 *----------------------------------------------------------------------------*/
2063 float32 float32_log2( float32 a STATUS_PARAM )
2064 {
2065     flag aSign, zSign;
2066     int16 aExp;
2067     bits32 aSig, zSig, i;
2068
2069     aSig = extractFloat32Frac( a );
2070     aExp = extractFloat32Exp( a );
2071     aSign = extractFloat32Sign( a );
2072
2073     if ( aExp == 0 ) {
2074         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2075         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2076     }
2077     if ( aSign ) {
2078         float_raise( float_flag_invalid STATUS_VAR);
2079         return float32_default_nan;
2080     }
2081     if ( aExp == 0xFF ) {
2082         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2083         return a;
2084     }
2085
2086     aExp -= 0x7F;
2087     aSig |= 0x00800000;
2088     zSign = aExp < 0;
2089     zSig = aExp << 23;
2090
2091     for (i = 1 << 22; i > 0; i >>= 1) {
2092         aSig = ( (bits64)aSig * aSig ) >> 23;
2093         if ( aSig & 0x01000000 ) {
2094             aSig >>= 1;
2095             zSig |= i;
2096         }
2097     }
2098
2099     if ( zSign )
2100         zSig = -zSig;
2101
2102     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2103 }
2104
2105 /*----------------------------------------------------------------------------
2106 | Returns 1 if the single-precision floating-point value `a' is equal to
2107 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2109 *----------------------------------------------------------------------------*/
2110
2111 int float32_eq( float32 a, float32 b STATUS_PARAM )
2112 {
2113
2114     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2115          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2116        ) {
2117         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2118             float_raise( float_flag_invalid STATUS_VAR);
2119         }
2120         return 0;
2121     }
2122     return ( float32_val(a) == float32_val(b) ) ||
2123             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2124
2125 }
2126
2127 /*----------------------------------------------------------------------------
2128 | Returns 1 if the single-precision floating-point value `a' is less than
2129 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2130 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2131 | Arithmetic.
2132 *----------------------------------------------------------------------------*/
2133
2134 int float32_le( float32 a, float32 b STATUS_PARAM )
2135 {
2136     flag aSign, bSign;
2137     bits32 av, bv;
2138
2139     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2140          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2141        ) {
2142         float_raise( float_flag_invalid STATUS_VAR);
2143         return 0;
2144     }
2145     aSign = extractFloat32Sign( a );
2146     bSign = extractFloat32Sign( b );
2147     av = float32_val(a);
2148     bv = float32_val(b);
2149     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2150     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2151
2152 }
2153
2154 /*----------------------------------------------------------------------------
2155 | Returns 1 if the single-precision floating-point value `a' is less than
2156 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2157 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2158 *----------------------------------------------------------------------------*/
2159
2160 int float32_lt( float32 a, float32 b STATUS_PARAM )
2161 {
2162     flag aSign, bSign;
2163     bits32 av, bv;
2164
2165     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2166          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2167        ) {
2168         float_raise( float_flag_invalid STATUS_VAR);
2169         return 0;
2170     }
2171     aSign = extractFloat32Sign( a );
2172     bSign = extractFloat32Sign( b );
2173     av = float32_val(a);
2174     bv = float32_val(b);
2175     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2176     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2177
2178 }
2179
2180 /*----------------------------------------------------------------------------
2181 | Returns 1 if the single-precision floating-point value `a' is equal to
2182 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2183 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2184 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 *----------------------------------------------------------------------------*/
2186
2187 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2188 {
2189     bits32 av, bv;
2190
2191     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2192          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2193        ) {
2194         float_raise( float_flag_invalid STATUS_VAR);
2195         return 0;
2196     }
2197     av = float32_val(a);
2198     bv = float32_val(b);
2199     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2200
2201 }
2202
2203 /*----------------------------------------------------------------------------
2204 | Returns 1 if the single-precision floating-point value `a' is less than or
2205 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2206 | cause an exception.  Otherwise, the comparison is performed according to the
2207 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2208 *----------------------------------------------------------------------------*/
2209
2210 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2211 {
2212     flag aSign, bSign;
2213     bits32 av, bv;
2214
2215     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2216          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2217        ) {
2218         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2219             float_raise( float_flag_invalid STATUS_VAR);
2220         }
2221         return 0;
2222     }
2223     aSign = extractFloat32Sign( a );
2224     bSign = extractFloat32Sign( b );
2225     av = float32_val(a);
2226     bv = float32_val(b);
2227     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2228     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2229
2230 }
2231
2232 /*----------------------------------------------------------------------------
2233 | Returns 1 if the single-precision floating-point value `a' is less than
2234 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2235 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2236 | Standard for Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2238
2239 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2240 {
2241     flag aSign, bSign;
2242     bits32 av, bv;
2243
2244     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2245          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2246        ) {
2247         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2248             float_raise( float_flag_invalid STATUS_VAR);
2249         }
2250         return 0;
2251     }
2252     aSign = extractFloat32Sign( a );
2253     bSign = extractFloat32Sign( b );
2254     av = float32_val(a);
2255     bv = float32_val(b);
2256     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2257     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2258
2259 }
2260
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 32-bit two's complement integer format.  The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic---which means in particular that the conversion is rounded
2266 | according to the current rounding mode.  If `a' is a NaN, the largest
2267 | positive integer is returned.  Otherwise, if the conversion overflows, the
2268 | largest integer with the same sign as `a' is returned.
2269 *----------------------------------------------------------------------------*/
2270
2271 int32 float64_to_int32( float64 a STATUS_PARAM )
2272 {
2273     flag aSign;
2274     int16 aExp, shiftCount;
2275     bits64 aSig;
2276
2277     aSig = extractFloat64Frac( a );
2278     aExp = extractFloat64Exp( a );
2279     aSign = extractFloat64Sign( a );
2280     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2281     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2282     shiftCount = 0x42C - aExp;
2283     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2284     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2285
2286 }
2287
2288 /*----------------------------------------------------------------------------
2289 | Returns the result of converting the double-precision floating-point value
2290 | `a' to the 32-bit two's complement integer format.  The conversion is
2291 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2292 | Arithmetic, except that the conversion is always rounded toward zero.
2293 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2294 | the conversion overflows, the largest integer with the same sign as `a' is
2295 | returned.
2296 *----------------------------------------------------------------------------*/
2297
2298 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2299 {
2300     flag aSign;
2301     int16 aExp, shiftCount;
2302     bits64 aSig, savedASig;
2303     int32 z;
2304
2305     aSig = extractFloat64Frac( a );
2306     aExp = extractFloat64Exp( a );
2307     aSign = extractFloat64Sign( a );
2308     if ( 0x41E < aExp ) {
2309         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2310         goto invalid;
2311     }
2312     else if ( aExp < 0x3FF ) {
2313         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2314         return 0;
2315     }
2316     aSig |= LIT64( 0x0010000000000000 );
2317     shiftCount = 0x433 - aExp;
2318     savedASig = aSig;
2319     aSig >>= shiftCount;
2320     z = aSig;
2321     if ( aSign ) z = - z;
2322     if ( ( z < 0 ) ^ aSign ) {
2323  invalid:
2324         float_raise( float_flag_invalid STATUS_VAR);
2325         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2326     }
2327     if ( ( aSig<<shiftCount ) != savedASig ) {
2328         STATUS(float_exception_flags) |= float_flag_inexact;
2329     }
2330     return z;
2331
2332 }
2333
2334 /*----------------------------------------------------------------------------
2335 | Returns the result of converting the double-precision floating-point value
2336 | `a' to the 64-bit two's complement integer format.  The conversion is
2337 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic---which means in particular that the conversion is rounded
2339 | according to the current rounding mode.  If `a' is a NaN, the largest
2340 | positive integer is returned.  Otherwise, if the conversion overflows, the
2341 | largest integer with the same sign as `a' is returned.
2342 *----------------------------------------------------------------------------*/
2343
2344 int64 float64_to_int64( float64 a STATUS_PARAM )
2345 {
2346     flag aSign;
2347     int16 aExp, shiftCount;
2348     bits64 aSig, aSigExtra;
2349
2350     aSig = extractFloat64Frac( a );
2351     aExp = extractFloat64Exp( a );
2352     aSign = extractFloat64Sign( a );
2353     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2354     shiftCount = 0x433 - aExp;
2355     if ( shiftCount <= 0 ) {
2356         if ( 0x43E < aExp ) {
2357             float_raise( float_flag_invalid STATUS_VAR);
2358             if (    ! aSign
2359                  || (    ( aExp == 0x7FF )
2360                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2361                ) {
2362                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2363             }
2364             return (sbits64) LIT64( 0x8000000000000000 );
2365         }
2366         aSigExtra = 0;
2367         aSig <<= - shiftCount;
2368     }
2369     else {
2370         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2371     }
2372     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2373
2374 }
2375
2376 /*----------------------------------------------------------------------------
2377 | Returns the result of converting the double-precision floating-point value
2378 | `a' to the 64-bit two's complement integer format.  The conversion is
2379 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2380 | Arithmetic, except that the conversion is always rounded toward zero.
2381 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2382 | the conversion overflows, the largest integer with the same sign as `a' is
2383 | returned.
2384 *----------------------------------------------------------------------------*/
2385
2386 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2387 {
2388     flag aSign;
2389     int16 aExp, shiftCount;
2390     bits64 aSig;
2391     int64 z;
2392
2393     aSig = extractFloat64Frac( a );
2394     aExp = extractFloat64Exp( a );
2395     aSign = extractFloat64Sign( a );
2396     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2397     shiftCount = aExp - 0x433;
2398     if ( 0 <= shiftCount ) {
2399         if ( 0x43E <= aExp ) {
2400             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2401                 float_raise( float_flag_invalid STATUS_VAR);
2402                 if (    ! aSign
2403                      || (    ( aExp == 0x7FF )
2404                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2405                    ) {
2406                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2407                 }
2408             }
2409             return (sbits64) LIT64( 0x8000000000000000 );
2410         }
2411         z = aSig<<shiftCount;
2412     }
2413     else {
2414         if ( aExp < 0x3FE ) {
2415             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2416             return 0;
2417         }
2418         z = aSig>>( - shiftCount );
2419         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2420             STATUS(float_exception_flags) |= float_flag_inexact;
2421         }
2422     }
2423     if ( aSign ) z = - z;
2424     return z;
2425
2426 }
2427
2428 /*----------------------------------------------------------------------------
2429 | Returns the result of converting the double-precision floating-point value
2430 | `a' to the single-precision floating-point format.  The conversion is
2431 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2432 | Arithmetic.
2433 *----------------------------------------------------------------------------*/
2434
2435 float32 float64_to_float32( float64 a STATUS_PARAM )
2436 {
2437     flag aSign;
2438     int16 aExp;
2439     bits64 aSig;
2440     bits32 zSig;
2441
2442     aSig = extractFloat64Frac( a );
2443     aExp = extractFloat64Exp( a );
2444     aSign = extractFloat64Sign( a );
2445     if ( aExp == 0x7FF ) {
2446         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2447         return packFloat32( aSign, 0xFF, 0 );
2448     }
2449     shift64RightJamming( aSig, 22, &aSig );
2450     zSig = aSig;
2451     if ( aExp || zSig ) {
2452         zSig |= 0x40000000;
2453         aExp -= 0x381;
2454     }
2455     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2456
2457 }
2458
2459
2460 /*----------------------------------------------------------------------------
2461 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2462 | half-precision floating-point value, returning the result.  After being
2463 | shifted into the proper positions, the three fields are simply added
2464 | together to form the result.  This means that any integer portion of `zSig'
2465 | will be added into the exponent.  Since a properly normalized significand
2466 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2467 | than the desired result exponent whenever `zSig' is a complete, normalized
2468 | significand.
2469 *----------------------------------------------------------------------------*/
2470 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2471 {
2472     return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2473 }
2474
2475 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2476    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2477
2478 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2479 {
2480     flag aSign;
2481     int16 aExp;
2482     bits32 aSig;
2483
2484     aSign = a >> 15;
2485     aExp = (a >> 10) & 0x1f;
2486     aSig = a & 0x3ff;
2487
2488     if (aExp == 0x1f && ieee) {
2489         if (aSig) {
2490             /* Make sure correct exceptions are raised.  */
2491             float32ToCommonNaN(a STATUS_VAR);
2492             aSig |= 0x200;
2493         }
2494         return packFloat32(aSign, 0xff, aSig << 13);
2495     }
2496     if (aExp == 0) {
2497         int8 shiftCount;
2498
2499         if (aSig == 0) {
2500             return packFloat32(aSign, 0, 0);
2501         }
2502
2503         shiftCount = countLeadingZeros32( aSig ) - 21;
2504         aSig = aSig << shiftCount;
2505         aExp = -shiftCount;
2506     }
2507     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2508 }
2509
2510 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2511 {
2512     flag aSign;
2513     int16 aExp;
2514     bits32 aSig;
2515     bits32 mask;
2516     bits32 increment;
2517     int8 roundingMode;
2518
2519     aSig = extractFloat32Frac( a );
2520     aExp = extractFloat32Exp( a );
2521     aSign = extractFloat32Sign( a );
2522     if ( aExp == 0xFF ) {
2523         if (aSig) {
2524             /* Make sure correct exceptions are raised.  */
2525             float32ToCommonNaN(a STATUS_VAR);
2526             aSig |= 0x00400000;
2527         }
2528         return packFloat16(aSign, 0x1f, aSig >> 13);
2529     }
2530     if (aExp == 0 && aSign == 0) {
2531         return packFloat16(aSign, 0, 0);
2532     }
2533     /* Decimal point between bits 22 and 23.  */
2534     aSig |= 0x00800000;
2535     aExp -= 0x7f;
2536     if (aExp < -14) {
2537         mask = 0x007fffff;
2538         if (aExp < -24) {
2539             aExp = -25;
2540         } else {
2541             mask >>= 24 + aExp;
2542         }
2543     } else {
2544         mask = 0x00001fff;
2545     }
2546     if (aSig & mask) {
2547         float_raise( float_flag_underflow STATUS_VAR );
2548         roundingMode = STATUS(float_rounding_mode);
2549         switch (roundingMode) {
2550         case float_round_nearest_even:
2551             increment = (mask + 1) >> 1;
2552             if ((aSig & mask) == increment) {
2553                 increment = aSig & (increment << 1);
2554             }
2555             break;
2556         case float_round_up:
2557             increment = aSign ? 0 : mask;
2558             break;
2559         case float_round_down:
2560             increment = aSign ? mask : 0;
2561             break;
2562         default: /* round_to_zero */
2563             increment = 0;
2564             break;
2565         }
2566         aSig += increment;
2567         if (aSig >= 0x01000000) {
2568             aSig >>= 1;
2569             aExp++;
2570         }
2571     } else if (aExp < -14
2572           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2573         float_raise( float_flag_underflow STATUS_VAR);
2574     }
2575
2576     if (ieee) {
2577         if (aExp > 15) {
2578             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2579             return packFloat16(aSign, 0x1f, 0);
2580         }
2581     } else {
2582         if (aExp > 16) {
2583             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2584             return packFloat16(aSign, 0x1f, 0x3ff);
2585         }
2586     }
2587     if (aExp < -24) {
2588         return packFloat16(aSign, 0, 0);
2589     }
2590     if (aExp < -14) {
2591         aSig >>= -14 - aExp;
2592         aExp = -14;
2593     }
2594     return packFloat16(aSign, aExp + 14, aSig >> 13);
2595 }
2596
2597 #ifdef FLOATX80
2598
2599 /*----------------------------------------------------------------------------
2600 | Returns the result of converting the double-precision floating-point value
2601 | `a' to the extended double-precision floating-point format.  The conversion
2602 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2603 | Arithmetic.
2604 *----------------------------------------------------------------------------*/
2605
2606 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2607 {
2608     flag aSign;
2609     int16 aExp;
2610     bits64 aSig;
2611
2612     aSig = extractFloat64Frac( a );
2613     aExp = extractFloat64Exp( a );
2614     aSign = extractFloat64Sign( a );
2615     if ( aExp == 0x7FF ) {
2616         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2617         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2618     }
2619     if ( aExp == 0 ) {
2620         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2621         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2622     }
2623     return
2624         packFloatx80(
2625             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2626
2627 }
2628
2629 #endif
2630
2631 #ifdef FLOAT128
2632
2633 /*----------------------------------------------------------------------------
2634 | Returns the result of converting the double-precision floating-point value
2635 | `a' to the quadruple-precision floating-point format.  The conversion is
2636 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2637 | Arithmetic.
2638 *----------------------------------------------------------------------------*/
2639
2640 float128 float64_to_float128( float64 a STATUS_PARAM )
2641 {
2642     flag aSign;
2643     int16 aExp;
2644     bits64 aSig, zSig0, zSig1;
2645
2646     aSig = extractFloat64Frac( a );
2647     aExp = extractFloat64Exp( a );
2648     aSign = extractFloat64Sign( a );
2649     if ( aExp == 0x7FF ) {
2650         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2651         return packFloat128( aSign, 0x7FFF, 0, 0 );
2652     }
2653     if ( aExp == 0 ) {
2654         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2655         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2656         --aExp;
2657     }
2658     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2659     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2660
2661 }
2662
2663 #endif
2664
2665 /*----------------------------------------------------------------------------
2666 | Rounds the double-precision floating-point value `a' to an integer, and
2667 | returns the result as a double-precision floating-point value.  The
2668 | operation is performed according to the IEC/IEEE Standard for Binary
2669 | Floating-Point Arithmetic.
2670 *----------------------------------------------------------------------------*/
2671
2672 float64 float64_round_to_int( float64 a STATUS_PARAM )
2673 {
2674     flag aSign;
2675     int16 aExp;
2676     bits64 lastBitMask, roundBitsMask;
2677     int8 roundingMode;
2678     bits64 z;
2679
2680     aExp = extractFloat64Exp( a );
2681     if ( 0x433 <= aExp ) {
2682         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2683             return propagateFloat64NaN( a, a STATUS_VAR );
2684         }
2685         return a;
2686     }
2687     if ( aExp < 0x3FF ) {
2688         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2689         STATUS(float_exception_flags) |= float_flag_inexact;
2690         aSign = extractFloat64Sign( a );
2691         switch ( STATUS(float_rounding_mode) ) {
2692          case float_round_nearest_even:
2693             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2694                 return packFloat64( aSign, 0x3FF, 0 );
2695             }
2696             break;
2697          case float_round_down:
2698             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2699          case float_round_up:
2700             return make_float64(
2701             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2702         }
2703         return packFloat64( aSign, 0, 0 );
2704     }
2705     lastBitMask = 1;
2706     lastBitMask <<= 0x433 - aExp;
2707     roundBitsMask = lastBitMask - 1;
2708     z = float64_val(a);
2709     roundingMode = STATUS(float_rounding_mode);
2710     if ( roundingMode == float_round_nearest_even ) {
2711         z += lastBitMask>>1;
2712         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2713     }
2714     else if ( roundingMode != float_round_to_zero ) {
2715         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2716             z += roundBitsMask;
2717         }
2718     }
2719     z &= ~ roundBitsMask;
2720     if ( z != float64_val(a) )
2721         STATUS(float_exception_flags) |= float_flag_inexact;
2722     return make_float64(z);
2723
2724 }
2725
2726 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2727 {
2728     int oldmode;
2729     float64 res;
2730     oldmode = STATUS(float_rounding_mode);
2731     STATUS(float_rounding_mode) = float_round_to_zero;
2732     res = float64_round_to_int(a STATUS_VAR);
2733     STATUS(float_rounding_mode) = oldmode;
2734     return res;
2735 }
2736
2737 /*----------------------------------------------------------------------------
2738 | Returns the result of adding the absolute values of the double-precision
2739 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2740 | before being returned.  `zSign' is ignored if the result is a NaN.
2741 | The addition is performed according to the IEC/IEEE Standard for Binary
2742 | Floating-Point Arithmetic.
2743 *----------------------------------------------------------------------------*/
2744
2745 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2746 {
2747     int16 aExp, bExp, zExp;
2748     bits64 aSig, bSig, zSig;
2749     int16 expDiff;
2750
2751     aSig = extractFloat64Frac( a );
2752     aExp = extractFloat64Exp( a );
2753     bSig = extractFloat64Frac( b );
2754     bExp = extractFloat64Exp( b );
2755     expDiff = aExp - bExp;
2756     aSig <<= 9;
2757     bSig <<= 9;
2758     if ( 0 < expDiff ) {
2759         if ( aExp == 0x7FF ) {
2760             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2761             return a;
2762         }
2763         if ( bExp == 0 ) {
2764             --expDiff;
2765         }
2766         else {
2767             bSig |= LIT64( 0x2000000000000000 );
2768         }
2769         shift64RightJamming( bSig, expDiff, &bSig );
2770         zExp = aExp;
2771     }
2772     else if ( expDiff < 0 ) {
2773         if ( bExp == 0x7FF ) {
2774             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775             return packFloat64( zSign, 0x7FF, 0 );
2776         }
2777         if ( aExp == 0 ) {
2778             ++expDiff;
2779         }
2780         else {
2781             aSig |= LIT64( 0x2000000000000000 );
2782         }
2783         shift64RightJamming( aSig, - expDiff, &aSig );
2784         zExp = bExp;
2785     }
2786     else {
2787         if ( aExp == 0x7FF ) {
2788             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2789             return a;
2790         }
2791         if ( aExp == 0 ) {
2792             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2793             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2794         }
2795         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2796         zExp = aExp;
2797         goto roundAndPack;
2798     }
2799     aSig |= LIT64( 0x2000000000000000 );
2800     zSig = ( aSig + bSig )<<1;
2801     --zExp;
2802     if ( (sbits64) zSig < 0 ) {
2803         zSig = aSig + bSig;
2804         ++zExp;
2805     }
2806  roundAndPack:
2807     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2808
2809 }
2810
2811 /*----------------------------------------------------------------------------
2812 | Returns the result of subtracting the absolute values of the double-
2813 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2814 | difference is negated before being returned.  `zSign' is ignored if the
2815 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2816 | Standard for Binary Floating-Point Arithmetic.
2817 *----------------------------------------------------------------------------*/
2818
2819 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2820 {
2821     int16 aExp, bExp, zExp;
2822     bits64 aSig, bSig, zSig;
2823     int16 expDiff;
2824
2825     aSig = extractFloat64Frac( a );
2826     aExp = extractFloat64Exp( a );
2827     bSig = extractFloat64Frac( b );
2828     bExp = extractFloat64Exp( b );
2829     expDiff = aExp - bExp;
2830     aSig <<= 10;
2831     bSig <<= 10;
2832     if ( 0 < expDiff ) goto aExpBigger;
2833     if ( expDiff < 0 ) goto bExpBigger;
2834     if ( aExp == 0x7FF ) {
2835         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2836         float_raise( float_flag_invalid STATUS_VAR);
2837         return float64_default_nan;
2838     }
2839     if ( aExp == 0 ) {
2840         aExp = 1;
2841         bExp = 1;
2842     }
2843     if ( bSig < aSig ) goto aBigger;
2844     if ( aSig < bSig ) goto bBigger;
2845     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2846  bExpBigger:
2847     if ( bExp == 0x7FF ) {
2848         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2849         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2850     }
2851     if ( aExp == 0 ) {
2852         ++expDiff;
2853     }
2854     else {
2855         aSig |= LIT64( 0x4000000000000000 );
2856     }
2857     shift64RightJamming( aSig, - expDiff, &aSig );
2858     bSig |= LIT64( 0x4000000000000000 );
2859  bBigger:
2860     zSig = bSig - aSig;
2861     zExp = bExp;
2862     zSign ^= 1;
2863     goto normalizeRoundAndPack;
2864  aExpBigger:
2865     if ( aExp == 0x7FF ) {
2866         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2867         return a;
2868     }
2869     if ( bExp == 0 ) {
2870         --expDiff;
2871     }
2872     else {
2873         bSig |= LIT64( 0x4000000000000000 );
2874     }
2875     shift64RightJamming( bSig, expDiff, &bSig );
2876     aSig |= LIT64( 0x4000000000000000 );
2877  aBigger:
2878     zSig = aSig - bSig;
2879     zExp = aExp;
2880  normalizeRoundAndPack:
2881     --zExp;
2882     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2883
2884 }
2885
2886 /*----------------------------------------------------------------------------
2887 | Returns the result of adding the double-precision floating-point values `a'
2888 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2889 | Binary Floating-Point Arithmetic.
2890 *----------------------------------------------------------------------------*/
2891
2892 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2893 {
2894     flag aSign, bSign;
2895
2896     aSign = extractFloat64Sign( a );
2897     bSign = extractFloat64Sign( b );
2898     if ( aSign == bSign ) {
2899         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2900     }
2901     else {
2902         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2903     }
2904
2905 }
2906
2907 /*----------------------------------------------------------------------------
2908 | Returns the result of subtracting the double-precision floating-point values
2909 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2910 | for Binary Floating-Point Arithmetic.
2911 *----------------------------------------------------------------------------*/
2912
2913 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2914 {
2915     flag aSign, bSign;
2916
2917     aSign = extractFloat64Sign( a );
2918     bSign = extractFloat64Sign( b );
2919     if ( aSign == bSign ) {
2920         return subFloat64Sigs( a, b, aSign STATUS_VAR );
2921     }
2922     else {
2923         return addFloat64Sigs( a, b, aSign STATUS_VAR );
2924     }
2925
2926 }
2927
2928 /*----------------------------------------------------------------------------
2929 | Returns the result of multiplying the double-precision floating-point values
2930 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2931 | for Binary Floating-Point Arithmetic.
2932 *----------------------------------------------------------------------------*/
2933
2934 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2935 {
2936     flag aSign, bSign, zSign;
2937     int16 aExp, bExp, zExp;
2938     bits64 aSig, bSig, zSig0, zSig1;
2939
2940     aSig = extractFloat64Frac( a );
2941     aExp = extractFloat64Exp( a );
2942     aSign = extractFloat64Sign( a );
2943     bSig = extractFloat64Frac( b );
2944     bExp = extractFloat64Exp( b );
2945     bSign = extractFloat64Sign( b );
2946     zSign = aSign ^ bSign;
2947     if ( aExp == 0x7FF ) {
2948         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2949             return propagateFloat64NaN( a, b STATUS_VAR );
2950         }
2951         if ( ( bExp | bSig ) == 0 ) {
2952             float_raise( float_flag_invalid STATUS_VAR);
2953             return float64_default_nan;
2954         }
2955         return packFloat64( zSign, 0x7FF, 0 );
2956     }
2957     if ( bExp == 0x7FF ) {
2958         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2959         if ( ( aExp | aSig ) == 0 ) {
2960             float_raise( float_flag_invalid STATUS_VAR);
2961             return float64_default_nan;
2962         }
2963         return packFloat64( zSign, 0x7FF, 0 );
2964     }
2965     if ( aExp == 0 ) {
2966         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2967         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968     }
2969     if ( bExp == 0 ) {
2970         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2971         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2972     }
2973     zExp = aExp + bExp - 0x3FF;
2974     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2975     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2976     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2977     zSig0 |= ( zSig1 != 0 );
2978     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2979         zSig0 <<= 1;
2980         --zExp;
2981     }
2982     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2983
2984 }
2985
2986 /*----------------------------------------------------------------------------
2987 | Returns the result of dividing the double-precision floating-point value `a'
2988 | by the corresponding value `b'.  The operation is performed according to
2989 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2990 *----------------------------------------------------------------------------*/
2991
2992 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2993 {
2994     flag aSign, bSign, zSign;
2995     int16 aExp, bExp, zExp;
2996     bits64 aSig, bSig, zSig;
2997     bits64 rem0, rem1;
2998     bits64 term0, term1;
2999
3000     aSig = extractFloat64Frac( a );
3001     aExp = extractFloat64Exp( a );
3002     aSign = extractFloat64Sign( a );
3003     bSig = extractFloat64Frac( b );
3004     bExp = extractFloat64Exp( b );
3005     bSign = extractFloat64Sign( b );
3006     zSign = aSign ^ bSign;
3007     if ( aExp == 0x7FF ) {
3008         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3009         if ( bExp == 0x7FF ) {
3010             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3011             float_raise( float_flag_invalid STATUS_VAR);
3012             return float64_default_nan;
3013         }
3014         return packFloat64( zSign, 0x7FF, 0 );
3015     }
3016     if ( bExp == 0x7FF ) {
3017         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3018         return packFloat64( zSign, 0, 0 );
3019     }
3020     if ( bExp == 0 ) {
3021         if ( bSig == 0 ) {
3022             if ( ( aExp | aSig ) == 0 ) {
3023                 float_raise( float_flag_invalid STATUS_VAR);
3024                 return float64_default_nan;
3025             }
3026             float_raise( float_flag_divbyzero STATUS_VAR);
3027             return packFloat64( zSign, 0x7FF, 0 );
3028         }
3029         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3030     }
3031     if ( aExp == 0 ) {
3032         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3033         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3034     }
3035     zExp = aExp - bExp + 0x3FD;
3036     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3037     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3038     if ( bSig <= ( aSig + aSig ) ) {
3039         aSig >>= 1;
3040         ++zExp;
3041     }
3042     zSig = estimateDiv128To64( aSig, 0, bSig );
3043     if ( ( zSig & 0x1FF ) <= 2 ) {
3044         mul64To128( bSig, zSig, &term0, &term1 );
3045         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3046         while ( (sbits64) rem0 < 0 ) {
3047             --zSig;
3048             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3049         }
3050         zSig |= ( rem1 != 0 );
3051     }
3052     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3053
3054 }
3055
3056 /*----------------------------------------------------------------------------
3057 | Returns the remainder of the double-precision floating-point value `a'
3058 | with respect to the corresponding value `b'.  The operation is performed
3059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3060 *----------------------------------------------------------------------------*/
3061
3062 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3063 {
3064     flag aSign, zSign;
3065     int16 aExp, bExp, expDiff;
3066     bits64 aSig, bSig;
3067     bits64 q, alternateASig;
3068     sbits64 sigMean;
3069
3070     aSig = extractFloat64Frac( a );
3071     aExp = extractFloat64Exp( a );
3072     aSign = extractFloat64Sign( a );
3073     bSig = extractFloat64Frac( b );
3074     bExp = extractFloat64Exp( b );
3075     if ( aExp == 0x7FF ) {
3076         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3077             return propagateFloat64NaN( a, b STATUS_VAR );
3078         }
3079         float_raise( float_flag_invalid STATUS_VAR);
3080         return float64_default_nan;
3081     }
3082     if ( bExp == 0x7FF ) {
3083         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3084         return a;
3085     }
3086     if ( bExp == 0 ) {
3087         if ( bSig == 0 ) {
3088             float_raise( float_flag_invalid STATUS_VAR);
3089             return float64_default_nan;
3090         }
3091         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3092     }
3093     if ( aExp == 0 ) {
3094         if ( aSig == 0 ) return a;
3095         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3096     }
3097     expDiff = aExp - bExp;
3098     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3099     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3100     if ( expDiff < 0 ) {
3101         if ( expDiff < -1 ) return a;
3102         aSig >>= 1;
3103     }
3104     q = ( bSig <= aSig );
3105     if ( q ) aSig -= bSig;
3106     expDiff -= 64;
3107     while ( 0 < expDiff ) {
3108         q = estimateDiv128To64( aSig, 0, bSig );
3109         q = ( 2 < q ) ? q - 2 : 0;
3110         aSig = - ( ( bSig>>2 ) * q );
3111         expDiff -= 62;
3112     }
3113     expDiff += 64;
3114     if ( 0 < expDiff ) {
3115         q = estimateDiv128To64( aSig, 0, bSig );
3116         q = ( 2 < q ) ? q - 2 : 0;
3117         q >>= 64 - expDiff;
3118         bSig >>= 2;
3119         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3120     }
3121     else {
3122         aSig >>= 2;
3123         bSig >>= 2;
3124     }
3125     do {
3126         alternateASig = aSig;
3127         ++q;
3128         aSig -= bSig;
3129     } while ( 0 <= (sbits64) aSig );
3130     sigMean = aSig + alternateASig;
3131     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3132         aSig = alternateASig;
3133     }
3134     zSign = ( (sbits64) aSig < 0 );
3135     if ( zSign ) aSig = - aSig;
3136     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3137
3138 }
3139
3140 /*----------------------------------------------------------------------------
3141 | Returns the square root of the double-precision floating-point value `a'.
3142 | The operation is performed according to the IEC/IEEE Standard for Binary
3143 | Floating-Point Arithmetic.
3144 *----------------------------------------------------------------------------*/
3145
3146 float64 float64_sqrt( float64 a STATUS_PARAM )
3147 {
3148     flag aSign;
3149     int16 aExp, zExp;
3150     bits64 aSig, zSig, doubleZSig;
3151     bits64 rem0, rem1, term0, term1;
3152
3153     aSig = extractFloat64Frac( a );
3154     aExp = extractFloat64Exp( a );
3155     aSign = extractFloat64Sign( a );
3156     if ( aExp == 0x7FF ) {
3157         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3158         if ( ! aSign ) return a;
3159         float_raise( float_flag_invalid STATUS_VAR);
3160         return float64_default_nan;
3161     }
3162     if ( aSign ) {
3163         if ( ( aExp | aSig ) == 0 ) return a;
3164         float_raise( float_flag_invalid STATUS_VAR);
3165         return float64_default_nan;
3166     }
3167     if ( aExp == 0 ) {
3168         if ( aSig == 0 ) return float64_zero;
3169         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3170     }
3171     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3172     aSig |= LIT64( 0x0010000000000000 );
3173     zSig = estimateSqrt32( aExp, aSig>>21 );
3174     aSig <<= 9 - ( aExp & 1 );
3175     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3176     if ( ( zSig & 0x1FF ) <= 5 ) {
3177         doubleZSig = zSig<<1;
3178         mul64To128( zSig, zSig, &term0, &term1 );
3179         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3180         while ( (sbits64) rem0 < 0 ) {
3181             --zSig;
3182             doubleZSig -= 2;
3183             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3184         }
3185         zSig |= ( ( rem0 | rem1 ) != 0 );
3186     }
3187     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3188
3189 }
3190
3191 /*----------------------------------------------------------------------------
3192 | Returns the binary log of the double-precision floating-point value `a'.
3193 | The operation is performed according to the IEC/IEEE Standard for Binary
3194 | Floating-Point Arithmetic.
3195 *----------------------------------------------------------------------------*/
3196 float64 float64_log2( float64 a STATUS_PARAM )
3197 {
3198     flag aSign, zSign;
3199     int16 aExp;
3200     bits64 aSig, aSig0, aSig1, zSig, i;
3201
3202     aSig = extractFloat64Frac( a );
3203     aExp = extractFloat64Exp( a );
3204     aSign = extractFloat64Sign( a );
3205
3206     if ( aExp == 0 ) {
3207         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3208         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3209     }
3210     if ( aSign ) {
3211         float_raise( float_flag_invalid STATUS_VAR);
3212         return float64_default_nan;
3213     }
3214     if ( aExp == 0x7FF ) {
3215         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3216         return a;
3217     }
3218
3219     aExp -= 0x3FF;
3220     aSig |= LIT64( 0x0010000000000000 );
3221     zSign = aExp < 0;
3222     zSig = (bits64)aExp << 52;
3223     for (i = 1LL << 51; i > 0; i >>= 1) {
3224         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3225         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3226         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3227             aSig >>= 1;
3228             zSig |= i;
3229         }
3230     }
3231
3232     if ( zSign )
3233         zSig = -zSig;
3234     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3235 }
3236
3237 /*----------------------------------------------------------------------------
3238 | Returns 1 if the double-precision floating-point value `a' is equal to the
3239 | corresponding value `b', and 0 otherwise.  The comparison is performed
3240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3241 *----------------------------------------------------------------------------*/
3242
3243 int float64_eq( float64 a, float64 b STATUS_PARAM )
3244 {
3245     bits64 av, bv;
3246
3247     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3248          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3249        ) {
3250         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3251             float_raise( float_flag_invalid STATUS_VAR);
3252         }
3253         return 0;
3254     }
3255     av = float64_val(a);
3256     bv = float64_val(b);
3257     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3258
3259 }
3260
3261 /*----------------------------------------------------------------------------
3262 | Returns 1 if the double-precision floating-point value `a' is less than or
3263 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3265 | Arithmetic.
3266 *----------------------------------------------------------------------------*/
3267
3268 int float64_le( float64 a, float64 b STATUS_PARAM )
3269 {
3270     flag aSign, bSign;
3271     bits64 av, bv;
3272
3273     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3274          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3275        ) {
3276         float_raise( float_flag_invalid STATUS_VAR);
3277         return 0;
3278     }
3279     aSign = extractFloat64Sign( a );
3280     bSign = extractFloat64Sign( b );
3281     av = float64_val(a);
3282     bv = float64_val(b);
3283     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3284     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3285
3286 }
3287
3288 /*----------------------------------------------------------------------------
3289 | Returns 1 if the double-precision floating-point value `a' is less than
3290 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3291 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3292 *----------------------------------------------------------------------------*/
3293
3294 int float64_lt( float64 a, float64 b STATUS_PARAM )
3295 {
3296     flag aSign, bSign;
3297     bits64 av, bv;
3298
3299     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3300          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3301        ) {
3302         float_raise( float_flag_invalid STATUS_VAR);
3303         return 0;
3304     }
3305     aSign = extractFloat64Sign( a );
3306     bSign = extractFloat64Sign( b );
3307     av = float64_val(a);
3308     bv = float64_val(b);
3309     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3310     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3311
3312 }
3313
3314 /*----------------------------------------------------------------------------
3315 | Returns 1 if the double-precision floating-point value `a' is equal to the
3316 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3317 | if either operand is a NaN.  Otherwise, the comparison is performed
3318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3319 *----------------------------------------------------------------------------*/
3320
3321 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3322 {
3323     bits64 av, bv;
3324
3325     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3326          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3327        ) {
3328         float_raise( float_flag_invalid STATUS_VAR);
3329         return 0;
3330     }
3331     av = float64_val(a);
3332     bv = float64_val(b);
3333     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3334
3335 }
3336
3337 /*----------------------------------------------------------------------------
3338 | Returns 1 if the double-precision floating-point value `a' is less than or
3339 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3340 | cause an exception.  Otherwise, the comparison is performed according to the
3341 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3342 *----------------------------------------------------------------------------*/
3343
3344 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3345 {
3346     flag aSign, bSign;
3347     bits64 av, bv;
3348
3349     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3350          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3351        ) {
3352         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3353             float_raise( float_flag_invalid STATUS_VAR);
3354         }
3355         return 0;
3356     }
3357     aSign = extractFloat64Sign( a );
3358     bSign = extractFloat64Sign( b );
3359     av = float64_val(a);
3360     bv = float64_val(b);
3361     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3362     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3363
3364 }
3365
3366 /*----------------------------------------------------------------------------
3367 | Returns 1 if the double-precision floating-point value `a' is less than
3368 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3369 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3370 | Standard for Binary Floating-Point Arithmetic.
3371 *----------------------------------------------------------------------------*/
3372
3373 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3374 {
3375     flag aSign, bSign;
3376     bits64 av, bv;
3377
3378     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3379          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3380        ) {
3381         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3382             float_raise( float_flag_invalid STATUS_VAR);
3383         }
3384         return 0;
3385     }
3386     aSign = extractFloat64Sign( a );
3387     bSign = extractFloat64Sign( b );
3388     av = float64_val(a);
3389     bv = float64_val(b);
3390     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3391     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3392
3393 }
3394
3395 #ifdef FLOATX80
3396
3397 /*----------------------------------------------------------------------------
3398 | Returns the result of converting the extended double-precision floating-
3399 | point value `a' to the 32-bit two's complement integer format.  The
3400 | conversion is performed according to the IEC/IEEE Standard for Binary
3401 | Floating-Point Arithmetic---which means in particular that the conversion
3402 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3403 | largest positive integer is returned.  Otherwise, if the conversion
3404 | overflows, the largest integer with the same sign as `a' is returned.
3405 *----------------------------------------------------------------------------*/
3406
3407 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3408 {
3409     flag aSign;
3410     int32 aExp, shiftCount;
3411     bits64 aSig;
3412
3413     aSig = extractFloatx80Frac( a );
3414     aExp = extractFloatx80Exp( a );
3415     aSign = extractFloatx80Sign( a );
3416     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3417     shiftCount = 0x4037 - aExp;
3418     if ( shiftCount <= 0 ) shiftCount = 1;
3419     shift64RightJamming( aSig, shiftCount, &aSig );
3420     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3421
3422 }
3423
3424 /*----------------------------------------------------------------------------
3425 | Returns the result of converting the extended double-precision floating-
3426 | point value `a' to the 32-bit two's complement integer format.  The
3427 | conversion is performed according to the IEC/IEEE Standard for Binary
3428 | Floating-Point Arithmetic, except that the conversion is always rounded
3429 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3430 | Otherwise, if the conversion overflows, the largest integer with the same
3431 | sign as `a' is returned.
3432 *----------------------------------------------------------------------------*/
3433
3434 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3435 {
3436     flag aSign;
3437     int32 aExp, shiftCount;
3438     bits64 aSig, savedASig;
3439     int32 z;
3440
3441     aSig = extractFloatx80Frac( a );
3442     aExp = extractFloatx80Exp( a );
3443     aSign = extractFloatx80Sign( a );
3444     if ( 0x401E < aExp ) {
3445         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3446         goto invalid;
3447     }
3448     else if ( aExp < 0x3FFF ) {
3449         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3450         return 0;
3451     }
3452     shiftCount = 0x403E - aExp;
3453     savedASig = aSig;
3454     aSig >>= shiftCount;
3455     z = aSig;
3456     if ( aSign ) z = - z;
3457     if ( ( z < 0 ) ^ aSign ) {
3458  invalid:
3459         float_raise( float_flag_invalid STATUS_VAR);
3460         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3461     }
3462     if ( ( aSig<<shiftCount ) != savedASig ) {
3463         STATUS(float_exception_flags) |= float_flag_inexact;
3464     }
3465     return z;
3466
3467 }
3468
3469 /*----------------------------------------------------------------------------
3470 | Returns the result of converting the extended double-precision floating-
3471 | point value `a' to the 64-bit two's complement integer format.  The
3472 | conversion is performed according to the IEC/IEEE Standard for Binary
3473 | Floating-Point Arithmetic---which means in particular that the conversion
3474 | is rounded according to the current rounding mode.  If `a' is a NaN,
3475 | the largest positive integer is returned.  Otherwise, if the conversion
3476 | overflows, the largest integer with the same sign as `a' is returned.
3477 *----------------------------------------------------------------------------*/
3478
3479 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3480 {
3481     flag aSign;
3482     int32 aExp, shiftCount;
3483     bits64 aSig, aSigExtra;
3484
3485     aSig = extractFloatx80Frac( a );
3486     aExp = extractFloatx80Exp( a );
3487     aSign = extractFloatx80Sign( a );
3488     shiftCount = 0x403E - aExp;
3489     if ( shiftCount <= 0 ) {
3490         if ( shiftCount ) {
3491             float_raise( float_flag_invalid STATUS_VAR);
3492             if (    ! aSign
3493                  || (    ( aExp == 0x7FFF )
3494                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3495                ) {
3496                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3497             }
3498             return (sbits64) LIT64( 0x8000000000000000 );
3499         }
3500         aSigExtra = 0;
3501     }
3502     else {
3503         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3504     }
3505     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3506
3507 }
3508
3509 /*----------------------------------------------------------------------------
3510 | Returns the result of converting the extended double-precision floating-
3511 | point value `a' to the 64-bit two's complement integer format.  The
3512 | conversion is performed according to the IEC/IEEE Standard for Binary
3513 | Floating-Point Arithmetic, except that the conversion is always rounded
3514 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3515 | Otherwise, if the conversion overflows, the largest integer with the same
3516 | sign as `a' is returned.
3517 *----------------------------------------------------------------------------*/
3518
3519 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3520 {
3521     flag aSign;
3522     int32 aExp, shiftCount;
3523     bits64 aSig;
3524     int64 z;
3525
3526     aSig = extractFloatx80Frac( a );
3527     aExp = extractFloatx80Exp( a );
3528     aSign = extractFloatx80Sign( a );
3529     shiftCount = aExp - 0x403E;
3530     if ( 0 <= shiftCount ) {
3531         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3532         if ( ( a.high != 0xC03E ) || aSig ) {
3533             float_raise( float_flag_invalid STATUS_VAR);
3534             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3535                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3536             }
3537         }
3538         return (sbits64) LIT64( 0x8000000000000000 );
3539     }
3540     else if ( aExp < 0x3FFF ) {
3541         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3542         return 0;
3543     }
3544     z = aSig>>( - shiftCount );
3545     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3546         STATUS(float_exception_flags) |= float_flag_inexact;
3547     }
3548     if ( aSign ) z = - z;
3549     return z;
3550
3551 }
3552
3553 /*----------------------------------------------------------------------------
3554 | Returns the result of converting the extended double-precision floating-
3555 | point value `a' to the single-precision floating-point format.  The
3556 | conversion is performed according to the IEC/IEEE Standard for Binary
3557 | Floating-Point Arithmetic.
3558 *----------------------------------------------------------------------------*/
3559
3560 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3561 {
3562     flag aSign;
3563     int32 aExp;
3564     bits64 aSig;
3565
3566     aSig = extractFloatx80Frac( a );
3567     aExp = extractFloatx80Exp( a );
3568     aSign = extractFloatx80Sign( a );
3569     if ( aExp == 0x7FFF ) {
3570         if ( (bits64) ( aSig<<1 ) ) {
3571             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3572         }
3573         return packFloat32( aSign, 0xFF, 0 );
3574     }
3575     shift64RightJamming( aSig, 33, &aSig );
3576     if ( aExp || aSig ) aExp -= 0x3F81;
3577     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3578
3579 }
3580
3581 /*----------------------------------------------------------------------------
3582 | Returns the result of converting the extended double-precision floating-
3583 | point value `a' to the double-precision floating-point format.  The
3584 | conversion is performed according to the IEC/IEEE Standard for Binary
3585 | Floating-Point Arithmetic.
3586 *----------------------------------------------------------------------------*/
3587
3588 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3589 {
3590     flag aSign;
3591     int32 aExp;
3592     bits64 aSig, zSig;
3593
3594     aSig = extractFloatx80Frac( a );
3595     aExp = extractFloatx80Exp( a );
3596     aSign = extractFloatx80Sign( a );
3597     if ( aExp == 0x7FFF ) {
3598         if ( (bits64) ( aSig<<1 ) ) {
3599             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3600         }
3601         return packFloat64( aSign, 0x7FF, 0 );
3602     }
3603     shift64RightJamming( aSig, 1, &zSig );
3604     if ( aExp || aSig ) aExp -= 0x3C01;
3605     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3606
3607 }
3608
3609 #ifdef FLOAT128
3610
3611 /*----------------------------------------------------------------------------
3612 | Returns the result of converting the extended double-precision floating-
3613 | point value `a' to the quadruple-precision floating-point format.  The
3614 | conversion is performed according to the IEC/IEEE Standard for Binary
3615 | Floating-Point Arithmetic.
3616 *----------------------------------------------------------------------------*/
3617
3618 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3619 {
3620     flag aSign;
3621     int16 aExp;
3622     bits64 aSig, zSig0, zSig1;
3623
3624     aSig = extractFloatx80Frac( a );
3625     aExp = extractFloatx80Exp( a );
3626     aSign = extractFloatx80Sign( a );
3627     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3628         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3629     }
3630     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3631     return packFloat128( aSign, aExp, zSig0, zSig1 );
3632
3633 }
3634
3635 #endif
3636
3637 /*----------------------------------------------------------------------------
3638 | Rounds the extended double-precision floating-point value `a' to an integer,
3639 | and returns the result as an extended quadruple-precision floating-point
3640 | value.  The operation is performed according to the IEC/IEEE Standard for
3641 | Binary Floating-Point Arithmetic.
3642 *----------------------------------------------------------------------------*/
3643
3644 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3645 {
3646     flag aSign;
3647     int32 aExp;
3648     bits64 lastBitMask, roundBitsMask;
3649     int8 roundingMode;
3650     floatx80 z;
3651
3652     aExp = extractFloatx80Exp( a );
3653     if ( 0x403E <= aExp ) {
3654         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3655             return propagateFloatx80NaN( a, a STATUS_VAR );
3656         }
3657         return a;
3658     }
3659     if ( aExp < 0x3FFF ) {
3660         if (    ( aExp == 0 )
3661              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3662             return a;
3663         }
3664         STATUS(float_exception_flags) |= float_flag_inexact;
3665         aSign = extractFloatx80Sign( a );
3666         switch ( STATUS(float_rounding_mode) ) {
3667          case float_round_nearest_even:
3668             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3669                ) {
3670                 return
3671                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3672             }
3673             break;
3674          case float_round_down:
3675             return
3676                   aSign ?
3677                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3678                 : packFloatx80( 0, 0, 0 );
3679          case float_round_up:
3680             return
3681                   aSign ? packFloatx80( 1, 0, 0 )
3682                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3683         }
3684         return packFloatx80( aSign, 0, 0 );
3685     }
3686     lastBitMask = 1;
3687     lastBitMask <<= 0x403E - aExp;
3688     roundBitsMask = lastBitMask - 1;
3689     z = a;
3690     roundingMode = STATUS(float_rounding_mode);
3691     if ( roundingMode == float_round_nearest_even ) {
3692         z.low += lastBitMask>>1;
3693         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3694     }
3695     else if ( roundingMode != float_round_to_zero ) {
3696         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3697             z.low += roundBitsMask;
3698         }
3699     }
3700     z.low &= ~ roundBitsMask;
3701     if ( z.low == 0 ) {
3702         ++z.high;
3703         z.low = LIT64( 0x8000000000000000 );
3704     }
3705     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3706     return z;
3707
3708 }
3709
3710 /*----------------------------------------------------------------------------
3711 | Returns the result of adding the absolute values of the extended double-
3712 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3713 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3714 | The addition is performed according to the IEC/IEEE Standard for Binary
3715 | Floating-Point Arithmetic.
3716 *----------------------------------------------------------------------------*/
3717
3718 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3719 {
3720     int32 aExp, bExp, zExp;
3721     bits64 aSig, bSig, zSig0, zSig1;
3722     int32 expDiff;
3723
3724     aSig = extractFloatx80Frac( a );
3725     aExp = extractFloatx80Exp( a );
3726     bSig = extractFloatx80Frac( b );
3727     bExp = extractFloatx80Exp( b );
3728     expDiff = aExp - bExp;
3729     if ( 0 < expDiff ) {
3730         if ( aExp == 0x7FFF ) {
3731             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3732             return a;
3733         }
3734         if ( bExp == 0 ) --expDiff;
3735         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3736         zExp = aExp;
3737     }
3738     else if ( expDiff < 0 ) {
3739         if ( bExp == 0x7FFF ) {
3740             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3741             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3742         }
3743         if ( aExp == 0 ) ++expDiff;
3744         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3745         zExp = bExp;
3746     }
3747     else {
3748         if ( aExp == 0x7FFF ) {
3749             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3750                 return propagateFloatx80NaN( a, b STATUS_VAR );
3751             }
3752             return a;
3753         }
3754         zSig1 = 0;
3755         zSig0 = aSig + bSig;
3756         if ( aExp == 0 ) {
3757             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3758             goto roundAndPack;
3759         }
3760         zExp = aExp;
3761         goto shiftRight1;
3762     }
3763     zSig0 = aSig + bSig;
3764     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3765  shiftRight1:
3766     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3767     zSig0 |= LIT64( 0x8000000000000000 );
3768     ++zExp;
3769  roundAndPack:
3770     return
3771         roundAndPackFloatx80(
3772             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3773
3774 }
3775
3776 /*----------------------------------------------------------------------------
3777 | Returns the result of subtracting the absolute values of the extended
3778 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3779 | difference is negated before being returned.  `zSign' is ignored if the
3780 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3781 | Standard for Binary Floating-Point Arithmetic.
3782 *----------------------------------------------------------------------------*/
3783
3784 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3785 {
3786     int32 aExp, bExp, zExp;
3787     bits64 aSig, bSig, zSig0, zSig1;
3788     int32 expDiff;
3789     floatx80 z;
3790
3791     aSig = extractFloatx80Frac( a );
3792     aExp = extractFloatx80Exp( a );
3793     bSig = extractFloatx80Frac( b );
3794     bExp = extractFloatx80Exp( b );
3795     expDiff = aExp - bExp;
3796     if ( 0 < expDiff ) goto aExpBigger;
3797     if ( expDiff < 0 ) goto bExpBigger;
3798     if ( aExp == 0x7FFF ) {
3799         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3800             return propagateFloatx80NaN( a, b STATUS_VAR );
3801         }
3802         float_raise( float_flag_invalid STATUS_VAR);
3803         z.low = floatx80_default_nan_low;
3804         z.high = floatx80_default_nan_high;
3805         return z;
3806     }
3807     if ( aExp == 0 ) {
3808         aExp = 1;
3809         bExp = 1;
3810     }
3811     zSig1 = 0;
3812     if ( bSig < aSig ) goto aBigger;
3813     if ( aSig < bSig ) goto bBigger;
3814     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3815  bExpBigger:
3816     if ( bExp == 0x7FFF ) {
3817         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3818         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3819     }
3820     if ( aExp == 0 ) ++expDiff;
3821     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3822  bBigger:
3823     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3824     zExp = bExp;
3825     zSign ^= 1;
3826     goto normalizeRoundAndPack;
3827  aExpBigger:
3828     if ( aExp == 0x7FFF ) {
3829         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3830         return a;
3831     }
3832     if ( bExp == 0 ) --expDiff;
3833     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3834  aBigger:
3835     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3836     zExp = aExp;
3837  normalizeRoundAndPack:
3838     return
3839         normalizeRoundAndPackFloatx80(
3840             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3841
3842 }
3843
3844 /*----------------------------------------------------------------------------
3845 | Returns the result of adding the extended double-precision floating-point
3846 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3847 | Standard for Binary Floating-Point Arithmetic.
3848 *----------------------------------------------------------------------------*/
3849
3850 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3851 {
3852     flag aSign, bSign;
3853
3854     aSign = extractFloatx80Sign( a );
3855     bSign = extractFloatx80Sign( b );
3856     if ( aSign == bSign ) {
3857         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3858     }
3859     else {
3860         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3861     }
3862
3863 }
3864
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of subtracting the extended double-precision floating-
3867 | point values `a' and `b'.  The operation is performed according to the
3868 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3869 *----------------------------------------------------------------------------*/
3870
3871 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3872 {
3873     flag aSign, bSign;
3874
3875     aSign = extractFloatx80Sign( a );
3876     bSign = extractFloatx80Sign( b );
3877     if ( aSign == bSign ) {
3878         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3879     }
3880     else {
3881         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3882     }
3883
3884 }
3885
3886 /*----------------------------------------------------------------------------
3887 | Returns the result of multiplying the extended double-precision floating-
3888 | point values `a' and `b'.  The operation is performed according to the
3889 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3890 *----------------------------------------------------------------------------*/
3891
3892 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3893 {
3894     flag aSign, bSign, zSign;
3895     int32 aExp, bExp, zExp;
3896     bits64 aSig, bSig, zSig0, zSig1;
3897     floatx80 z;
3898
3899     aSig = extractFloatx80Frac( a );
3900     aExp = extractFloatx80Exp( a );
3901     aSign = extractFloatx80Sign( a );
3902     bSig = extractFloatx80Frac( b );
3903     bExp = extractFloatx80Exp( b );
3904     bSign = extractFloatx80Sign( b );
3905     zSign = aSign ^ bSign;
3906     if ( aExp == 0x7FFF ) {
3907         if (    (bits64) ( aSig<<1 )
3908              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3909             return propagateFloatx80NaN( a, b STATUS_VAR );
3910         }
3911         if ( ( bExp | bSig ) == 0 ) goto invalid;
3912         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3913     }
3914     if ( bExp == 0x7FFF ) {
3915         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3916         if ( ( aExp | aSig ) == 0 ) {
3917  invalid:
3918             float_raise( float_flag_invalid STATUS_VAR);
3919             z.low = floatx80_default_nan_low;
3920             z.high = floatx80_default_nan_high;
3921             return z;
3922         }
3923         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3924     }
3925     if ( aExp == 0 ) {
3926         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3927         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3928     }
3929     if ( bExp == 0 ) {
3930         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3931         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3932     }
3933     zExp = aExp + bExp - 0x3FFE;
3934     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3935     if ( 0 < (sbits64) zSig0 ) {
3936         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3937         --zExp;
3938     }
3939     return
3940         roundAndPackFloatx80(
3941             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3942
3943 }
3944
3945 /*----------------------------------------------------------------------------
3946 | Returns the result of dividing the extended double-precision floating-point
3947 | value `a' by the corresponding value `b'.  The operation is performed
3948 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3950
3951 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3952 {
3953     flag aSign, bSign, zSign;
3954     int32 aExp, bExp, zExp;
3955     bits64 aSig, bSig, zSig0, zSig1;
3956     bits64 rem0, rem1, rem2, term0, term1, term2;
3957     floatx80 z;
3958
3959     aSig = extractFloatx80Frac( a );
3960     aExp = extractFloatx80Exp( a );
3961     aSign = extractFloatx80Sign( a );
3962     bSig = extractFloatx80Frac( b );
3963     bExp = extractFloatx80Exp( b );
3964     bSign = extractFloatx80Sign( b );
3965     zSign = aSign ^ bSign;
3966     if ( aExp == 0x7FFF ) {
3967         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3968         if ( bExp == 0x7FFF ) {
3969             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3970             goto invalid;
3971         }
3972         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3973     }
3974     if ( bExp == 0x7FFF ) {
3975         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3976         return packFloatx80( zSign, 0, 0 );
3977     }
3978     if ( bExp == 0 ) {
3979         if ( bSig == 0 ) {
3980             if ( ( aExp | aSig ) == 0 ) {
3981  invalid:
3982                 float_raise( float_flag_invalid STATUS_VAR);
3983                 z.low = floatx80_default_nan_low;
3984                 z.high = floatx80_default_nan_high;
3985                 return z;
3986             }
3987             float_raise( float_flag_divbyzero STATUS_VAR);
3988             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3989         }
3990         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3991     }
3992     if ( aExp == 0 ) {
3993         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3994         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3995     }
3996     zExp = aExp - bExp + 0x3FFE;
3997     rem1 = 0;
3998     if ( bSig <= aSig ) {
3999         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4000         ++zExp;
4001     }
4002     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4003     mul64To128( bSig, zSig0, &term0, &term1 );
4004     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4005     while ( (sbits64) rem0 < 0 ) {
4006         --zSig0;
4007         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4008     }
4009     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4010     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4011         mul64To128( bSig, zSig1, &term1, &term2 );
4012         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4013         while ( (sbits64) rem1 < 0 ) {
4014             --zSig1;
4015             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4016         }
4017         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4018     }
4019     return
4020         roundAndPackFloatx80(
4021             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4022
4023 }
4024
4025 /*----------------------------------------------------------------------------
4026 | Returns the remainder of the extended double-precision floating-point value
4027 | `a' with respect to the corresponding value `b'.  The operation is performed
4028 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4029 *----------------------------------------------------------------------------*/
4030
4031 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4032 {
4033     flag aSign, zSign;
4034     int32 aExp, bExp, expDiff;
4035     bits64 aSig0, aSig1, bSig;
4036     bits64 q, term0, term1, alternateASig0, alternateASig1;
4037     floatx80 z;
4038
4039     aSig0 = extractFloatx80Frac( a );
4040     aExp = extractFloatx80Exp( a );
4041     aSign = extractFloatx80Sign( a );
4042     bSig = extractFloatx80Frac( b );
4043     bExp = extractFloatx80Exp( b );
4044     if ( aExp == 0x7FFF ) {
4045         if (    (bits64) ( aSig0<<1 )
4046              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4047             return propagateFloatx80NaN( a, b STATUS_VAR );
4048         }
4049         goto invalid;
4050     }
4051     if ( bExp == 0x7FFF ) {
4052         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4053         return a;
4054     }
4055     if ( bExp == 0 ) {
4056         if ( bSig == 0 ) {
4057  invalid:
4058             float_raise( float_flag_invalid STATUS_VAR);
4059             z.low = floatx80_default_nan_low;
4060             z.high = floatx80_default_nan_high;
4061             return z;
4062         }
4063         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4064     }
4065     if ( aExp == 0 ) {
4066         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4067         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4068     }
4069     bSig |= LIT64( 0x8000000000000000 );
4070     zSign = aSign;
4071     expDiff = aExp - bExp;
4072     aSig1 = 0;
4073     if ( expDiff < 0 ) {
4074         if ( expDiff < -1 ) return a;
4075         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4076         expDiff = 0;
4077     }
4078     q = ( bSig <= aSig0 );
4079     if ( q ) aSig0 -= bSig;
4080     expDiff -= 64;
4081     while ( 0 < expDiff ) {
4082         q = estimateDiv128To64( aSig0, aSig1, bSig );
4083         q = ( 2 < q ) ? q - 2 : 0;
4084         mul64To128( bSig, q, &term0, &term1 );
4085         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4086         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4087         expDiff -= 62;
4088     }
4089     expDiff += 64;
4090     if ( 0 < expDiff ) {
4091         q = estimateDiv128To64( aSig0, aSig1, bSig );
4092         q = ( 2 < q ) ? q - 2 : 0;
4093         q >>= 64 - expDiff;
4094         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4095         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4096         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4097         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4098             ++q;
4099             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4100         }
4101     }
4102     else {
4103         term1 = 0;
4104         term0 = bSig;
4105     }
4106     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4107     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4108          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4109               && ( q & 1 ) )
4110        ) {
4111         aSig0 = alternateASig0;
4112         aSig1 = alternateASig1;
4113         zSign = ! zSign;
4114     }
4115     return
4116         normalizeRoundAndPackFloatx80(
4117             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4118
4119 }
4120
4121 /*----------------------------------------------------------------------------
4122 | Returns the square root of the extended double-precision floating-point
4123 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4124 | for Binary Floating-Point Arithmetic.
4125 *----------------------------------------------------------------------------*/
4126
4127 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4128 {
4129     flag aSign;
4130     int32 aExp, zExp;
4131     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4132     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4133     floatx80 z;
4134
4135     aSig0 = extractFloatx80Frac( a );
4136     aExp = extractFloatx80Exp( a );
4137     aSign = extractFloatx80Sign( a );
4138     if ( aExp == 0x7FFF ) {
4139         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4140         if ( ! aSign ) return a;
4141         goto invalid;
4142     }
4143     if ( aSign ) {
4144         if ( ( aExp | aSig0 ) == 0 ) return a;
4145  invalid:
4146         float_raise( float_flag_invalid STATUS_VAR);
4147         z.low = floatx80_default_nan_low;
4148         z.high = floatx80_default_nan_high;
4149         return z;
4150     }
4151     if ( aExp == 0 ) {
4152         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4153         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4154     }
4155     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4156     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4157     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4158     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4159     doubleZSig0 = zSig0<<1;
4160     mul64To128( zSig0, zSig0, &term0, &term1 );
4161     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4162     while ( (sbits64) rem0 < 0 ) {
4163         --zSig0;
4164         doubleZSig0 -= 2;
4165         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4166     }
4167     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4168     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4169         if ( zSig1 == 0 ) zSig1 = 1;
4170         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4171         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4172         mul64To128( zSig1, zSig1, &term2, &term3 );
4173         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4174         while ( (sbits64) rem1 < 0 ) {
4175             --zSig1;
4176             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4177             term3 |= 1;
4178             term2 |= doubleZSig0;
4179             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4180         }
4181         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4182     }
4183     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4184     zSig0 |= doubleZSig0;
4185     return
4186         roundAndPackFloatx80(
4187             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4188
4189 }
4190
4191 /*----------------------------------------------------------------------------
4192 | Returns 1 if the extended double-precision floating-point value `a' is
4193 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4194 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4195 | Arithmetic.
4196 *----------------------------------------------------------------------------*/
4197
4198 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4199 {
4200
4201     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4202               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4203          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4204               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4205        ) {
4206         if (    floatx80_is_signaling_nan( a )
4207              || floatx80_is_signaling_nan( b ) ) {
4208             float_raise( float_flag_invalid STATUS_VAR);
4209         }
4210         return 0;
4211     }
4212     return
4213            ( a.low == b.low )
4214         && (    ( a.high == b.high )
4215              || (    ( a.low == 0 )
4216                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4217            );
4218
4219 }
4220
4221 /*----------------------------------------------------------------------------
4222 | Returns 1 if the extended double-precision floating-point value `a' is
4223 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4224 | comparison is performed according to the IEC/IEEE Standard for Binary
4225 | Floating-Point Arithmetic.
4226 *----------------------------------------------------------------------------*/
4227
4228 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4229 {
4230     flag aSign, bSign;
4231
4232     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4233               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4234          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4235               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4236        ) {
4237         float_raise( float_flag_invalid STATUS_VAR);
4238         return 0;
4239     }
4240     aSign = extractFloatx80Sign( a );
4241     bSign = extractFloatx80Sign( b );
4242     if ( aSign != bSign ) {
4243         return
4244                aSign
4245             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246                  == 0 );
4247     }
4248     return
4249           aSign ? le128( b.high, b.low, a.high, a.low )
4250         : le128( a.high, a.low, b.high, b.low );
4251
4252 }
4253
4254 /*----------------------------------------------------------------------------
4255 | Returns 1 if the extended double-precision floating-point value `a' is
4256 | less than the corresponding value `b', and 0 otherwise.  The comparison
4257 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4258 | Arithmetic.
4259 *----------------------------------------------------------------------------*/
4260
4261 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4262 {
4263     flag aSign, bSign;
4264
4265     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4266               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4267          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4268               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4269        ) {
4270         float_raise( float_flag_invalid STATUS_VAR);
4271         return 0;
4272     }
4273     aSign = extractFloatx80Sign( a );
4274     bSign = extractFloatx80Sign( b );
4275     if ( aSign != bSign ) {
4276         return
4277                aSign
4278             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4279                  != 0 );
4280     }
4281     return
4282           aSign ? lt128( b.high, b.low, a.high, a.low )
4283         : lt128( a.high, a.low, b.high, b.low );
4284
4285 }
4286
4287 /*----------------------------------------------------------------------------
4288 | Returns 1 if the extended double-precision floating-point value `a' is equal
4289 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4290 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4291 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4292 *----------------------------------------------------------------------------*/
4293
4294 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4295 {
4296
4297     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4298               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4299          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4300               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4301        ) {
4302         float_raise( float_flag_invalid STATUS_VAR);
4303         return 0;
4304     }
4305     return
4306            ( a.low == b.low )
4307         && (    ( a.high == b.high )
4308              || (    ( a.low == 0 )
4309                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4310            );
4311
4312 }
4313
4314 /*----------------------------------------------------------------------------
4315 | Returns 1 if the extended double-precision floating-point value `a' is less
4316 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4317 | do not cause an exception.  Otherwise, the comparison is performed according
4318 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4319 *----------------------------------------------------------------------------*/
4320
4321 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4322 {
4323     flag aSign, bSign;
4324
4325     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4326               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4327          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4328               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4329        ) {
4330         if (    floatx80_is_signaling_nan( a )
4331              || floatx80_is_signaling_nan( b ) ) {
4332             float_raise( float_flag_invalid STATUS_VAR);
4333         }
4334         return 0;
4335     }
4336     aSign = extractFloatx80Sign( a );
4337     bSign = extractFloatx80Sign( b );
4338     if ( aSign != bSign ) {
4339         return
4340                aSign
4341             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4342                  == 0 );
4343     }
4344     return
4345           aSign ? le128( b.high, b.low, a.high, a.low )
4346         : le128( a.high, a.low, b.high, b.low );
4347
4348 }
4349
4350 /*----------------------------------------------------------------------------
4351 | Returns 1 if the extended double-precision floating-point value `a' is less
4352 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4353 | an exception.  Otherwise, the comparison is performed according to the
4354 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4355 *----------------------------------------------------------------------------*/
4356
4357 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4358 {
4359     flag aSign, bSign;
4360
4361     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4362               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4363          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4364               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4365        ) {
4366         if (    floatx80_is_signaling_nan( a )
4367              || floatx80_is_signaling_nan( b ) ) {
4368             float_raise( float_flag_invalid STATUS_VAR);
4369         }
4370         return 0;
4371     }
4372     aSign = extractFloatx80Sign( a );
4373     bSign = extractFloatx80Sign( b );
4374     if ( aSign != bSign ) {
4375         return
4376                aSign
4377             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4378                  != 0 );
4379     }
4380     return
4381           aSign ? lt128( b.high, b.low, a.high, a.low )
4382         : lt128( a.high, a.low, b.high, b.low );
4383
4384 }
4385
4386 #endif
4387
4388 #ifdef FLOAT128
4389
4390 /*----------------------------------------------------------------------------
4391 | Returns the result of converting the quadruple-precision floating-point
4392 | value `a' to the 32-bit two's complement integer format.  The conversion
4393 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4394 | Arithmetic---which means in particular that the conversion is rounded
4395 | according to the current rounding mode.  If `a' is a NaN, the largest
4396 | positive integer is returned.  Otherwise, if the conversion overflows, the
4397 | largest integer with the same sign as `a' is returned.
4398 *----------------------------------------------------------------------------*/
4399
4400 int32 float128_to_int32( float128 a STATUS_PARAM )
4401 {
4402     flag aSign;
4403     int32 aExp, shiftCount;
4404     bits64 aSig0, aSig1;
4405
4406     aSig1 = extractFloat128Frac1( a );
4407     aSig0 = extractFloat128Frac0( a );
4408     aExp = extractFloat128Exp( a );
4409     aSign = extractFloat128Sign( a );
4410     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4411     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4412     aSig0 |= ( aSig1 != 0 );
4413     shiftCount = 0x4028 - aExp;
4414     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4415     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4416
4417 }
4418
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the quadruple-precision floating-point
4421 | value `a' to the 32-bit two's complement integer format.  The conversion
4422 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4423 | Arithmetic, except that the conversion is always rounded toward zero.  If
4424 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4425 | conversion overflows, the largest integer with the same sign as `a' is
4426 | returned.
4427 *----------------------------------------------------------------------------*/
4428
4429 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4430 {
4431     flag aSign;
4432     int32 aExp, shiftCount;
4433     bits64 aSig0, aSig1, savedASig;
4434     int32 z;
4435
4436     aSig1 = extractFloat128Frac1( a );
4437     aSig0 = extractFloat128Frac0( a );
4438     aExp = extractFloat128Exp( a );
4439     aSign = extractFloat128Sign( a );
4440     aSig0 |= ( aSig1 != 0 );
4441     if ( 0x401E < aExp ) {
4442         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4443         goto invalid;
4444     }
4445     else if ( aExp < 0x3FFF ) {
4446         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4447         return 0;
4448     }
4449     aSig0 |= LIT64( 0x0001000000000000 );
4450     shiftCount = 0x402F - aExp;
4451     savedASig = aSig0;
4452     aSig0 >>= shiftCount;
4453     z = aSig0;
4454     if ( aSign ) z = - z;
4455     if ( ( z < 0 ) ^ aSign ) {
4456  invalid:
4457         float_raise( float_flag_invalid STATUS_VAR);
4458         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4459     }
4460     if ( ( aSig0<<shiftCount ) != savedASig ) {
4461         STATUS(float_exception_flags) |= float_flag_inexact;
4462     }
4463     return z;
4464
4465 }
4466
4467 /*----------------------------------------------------------------------------
4468 | Returns the result of converting the quadruple-precision floating-point
4469 | value `a' to the 64-bit two's complement integer format.  The conversion
4470 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4471 | Arithmetic---which means in particular that the conversion is rounded
4472 | according to the current rounding mode.  If `a' is a NaN, the largest
4473 | positive integer is returned.  Otherwise, if the conversion overflows, the
4474 | largest integer with the same sign as `a' is returned.
4475 *----------------------------------------------------------------------------*/
4476
4477 int64 float128_to_int64( float128 a STATUS_PARAM )
4478 {
4479     flag aSign;
4480     int32 aExp, shiftCount;
4481     bits64 aSig0, aSig1;
4482
4483     aSig1 = extractFloat128Frac1( a );
4484     aSig0 = extractFloat128Frac0( a );
4485     aExp = extractFloat128Exp( a );
4486     aSign = extractFloat128Sign( a );
4487     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4488     shiftCount = 0x402F - aExp;
4489     if ( shiftCount <= 0 ) {
4490         if ( 0x403E < aExp ) {
4491             float_raise( float_flag_invalid STATUS_VAR);
4492             if (    ! aSign
4493                  || (    ( aExp == 0x7FFF )
4494                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4495                     )
4496                ) {
4497                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4498             }
4499             return (sbits64) LIT64( 0x8000000000000000 );
4500         }
4501         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4502     }
4503     else {
4504         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4505     }
4506     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4507
4508 }
4509
4510 /*----------------------------------------------------------------------------
4511 | Returns the result of converting the quadruple-precision floating-point
4512 | value `a' to the 64-bit two's complement integer format.  The conversion
4513 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4514 | Arithmetic, except that the conversion is always rounded toward zero.
4515 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4516 | the conversion overflows, the largest integer with the same sign as `a' is
4517 | returned.
4518 *----------------------------------------------------------------------------*/
4519
4520 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4521 {
4522     flag aSign;
4523     int32 aExp, shiftCount;
4524     bits64 aSig0, aSig1;
4525     int64 z;
4526
4527     aSig1 = extractFloat128Frac1( a );
4528     aSig0 = extractFloat128Frac0( a );
4529     aExp = extractFloat128Exp( a );
4530     aSign = extractFloat128Sign( a );
4531     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4532     shiftCount = aExp - 0x402F;
4533     if ( 0 < shiftCount ) {
4534         if ( 0x403E <= aExp ) {
4535             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4536             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4537                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4538                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4539             }
4540             else {
4541                 float_raise( float_flag_invalid STATUS_VAR);
4542                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4543                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4544                 }
4545             }
4546             return (sbits64) LIT64( 0x8000000000000000 );
4547         }
4548         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4549         if ( (bits64) ( aSig1<<shiftCount ) ) {
4550             STATUS(float_exception_flags) |= float_flag_inexact;
4551         }
4552     }
4553     else {
4554         if ( aExp < 0x3FFF ) {
4555             if ( aExp | aSig0 | aSig1 ) {
4556                 STATUS(float_exception_flags) |= float_flag_inexact;
4557             }
4558             return 0;
4559         }
4560         z = aSig0>>( - shiftCount );
4561         if (    aSig1
4562              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4563             STATUS(float_exception_flags) |= float_flag_inexact;
4564         }
4565     }
4566     if ( aSign ) z = - z;
4567     return z;
4568
4569 }
4570
4571 /*----------------------------------------------------------------------------
4572 | Returns the result of converting the quadruple-precision floating-point
4573 | value `a' to the single-precision floating-point format.  The conversion
4574 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4575 | Arithmetic.
4576 *----------------------------------------------------------------------------*/
4577
4578 float32 float128_to_float32( float128 a STATUS_PARAM )
4579 {
4580     flag aSign;
4581     int32 aExp;
4582     bits64 aSig0, aSig1;
4583     bits32 zSig;
4584
4585     aSig1 = extractFloat128Frac1( a );
4586     aSig0 = extractFloat128Frac0( a );
4587     aExp = extractFloat128Exp( a );
4588     aSign = extractFloat128Sign( a );
4589     if ( aExp == 0x7FFF ) {
4590         if ( aSig0 | aSig1 ) {
4591             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4592         }
4593         return packFloat32( aSign, 0xFF, 0 );
4594     }
4595     aSig0 |= ( aSig1 != 0 );
4596     shift64RightJamming( aSig0, 18, &aSig0 );
4597     zSig = aSig0;
4598     if ( aExp || zSig ) {
4599         zSig |= 0x40000000;
4600         aExp -= 0x3F81;
4601     }
4602     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4603
4604 }
4605
4606 /*----------------------------------------------------------------------------
4607 | Returns the result of converting the quadruple-precision floating-point
4608 | value `a' to the double-precision floating-point format.  The conversion
4609 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4610 | Arithmetic.
4611 *----------------------------------------------------------------------------*/
4612
4613 float64 float128_to_float64( float128 a STATUS_PARAM )
4614 {
4615     flag aSign;
4616     int32 aExp;
4617     bits64 aSig0, aSig1;
4618
4619     aSig1 = extractFloat128Frac1( a );
4620     aSig0 = extractFloat128Frac0( a );
4621     aExp = extractFloat128Exp( a );
4622     aSign = extractFloat128Sign( a );
4623     if ( aExp == 0x7FFF ) {
4624         if ( aSig0 | aSig1 ) {
4625             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4626         }
4627         return packFloat64( aSign, 0x7FF, 0 );
4628     }
4629     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4630     aSig0 |= ( aSig1 != 0 );
4631     if ( aExp || aSig0 ) {
4632         aSig0 |= LIT64( 0x4000000000000000 );
4633         aExp -= 0x3C01;
4634     }
4635     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4636
4637 }
4638
4639 #ifdef FLOATX80
4640
4641 /*----------------------------------------------------------------------------
4642 | Returns the result of converting the quadruple-precision floating-point
4643 | value `a' to the extended double-precision floating-point format.  The
4644 | conversion is performed according to the IEC/IEEE Standard for Binary
4645 | Floating-Point Arithmetic.
4646 *----------------------------------------------------------------------------*/
4647
4648 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4649 {
4650     flag aSign;
4651     int32 aExp;
4652     bits64 aSig0, aSig1;
4653
4654     aSig1 = extractFloat128Frac1( a );
4655     aSig0 = extractFloat128Frac0( a );
4656     aExp = extractFloat128Exp( a );
4657     aSign = extractFloat128Sign( a );
4658     if ( aExp == 0x7FFF ) {
4659         if ( aSig0 | aSig1 ) {
4660             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4661         }
4662         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4663     }
4664     if ( aExp == 0 ) {
4665         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4666         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4667     }
4668     else {
4669         aSig0 |= LIT64( 0x0001000000000000 );
4670     }
4671     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4672     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4673
4674 }
4675
4676 #endif
4677
4678 /*----------------------------------------------------------------------------
4679 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4680 | returns the result as a quadruple-precision floating-point value.  The
4681 | operation is performed according to the IEC/IEEE Standard for Binary
4682 | Floating-Point Arithmetic.
4683 *----------------------------------------------------------------------------*/
4684
4685 float128 float128_round_to_int( float128 a STATUS_PARAM )
4686 {
4687     flag aSign;
4688     int32 aExp;
4689     bits64 lastBitMask, roundBitsMask;
4690     int8 roundingMode;
4691     float128 z;
4692
4693     aExp = extractFloat128Exp( a );
4694     if ( 0x402F <= aExp ) {
4695         if ( 0x406F <= aExp ) {
4696             if (    ( aExp == 0x7FFF )
4697                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4698                ) {
4699                 return propagateFloat128NaN( a, a STATUS_VAR );
4700             }
4701             return a;
4702         }
4703         lastBitMask = 1;
4704         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4705         roundBitsMask = lastBitMask - 1;
4706         z = a;
4707         roundingMode = STATUS(float_rounding_mode);
4708         if ( roundingMode == float_round_nearest_even ) {
4709             if ( lastBitMask ) {
4710                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4711                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4712             }
4713             else {
4714                 if ( (sbits64) z.low < 0 ) {
4715                     ++z.high;
4716                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4717                 }
4718             }
4719         }
4720         else if ( roundingMode != float_round_to_zero ) {
4721             if (   extractFloat128Sign( z )
4722                  ^ ( roundingMode == float_round_up ) ) {
4723                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4724             }
4725         }
4726         z.low &= ~ roundBitsMask;
4727     }
4728     else {
4729         if ( aExp < 0x3FFF ) {
4730             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4731             STATUS(float_exception_flags) |= float_flag_inexact;
4732             aSign = extractFloat128Sign( a );
4733             switch ( STATUS(float_rounding_mode) ) {
4734              case float_round_nearest_even:
4735                 if (    ( aExp == 0x3FFE )
4736                      && (   extractFloat128Frac0( a )
4737                           | extractFloat128Frac1( a ) )
4738                    ) {
4739                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4740                 }
4741                 break;
4742              case float_round_down:
4743                 return
4744                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4745                     : packFloat128( 0, 0, 0, 0 );
4746              case float_round_up:
4747                 return
4748                       aSign ? packFloat128( 1, 0, 0, 0 )
4749                     : packFloat128( 0, 0x3FFF, 0, 0 );
4750             }
4751             return packFloat128( aSign, 0, 0, 0 );
4752         }
4753         lastBitMask = 1;
4754         lastBitMask <<= 0x402F - aExp;
4755         roundBitsMask = lastBitMask - 1;
4756         z.low = 0;
4757         z.high = a.high;
4758         roundingMode = STATUS(float_rounding_mode);
4759         if ( roundingMode == float_round_nearest_even ) {
4760             z.high += lastBitMask>>1;
4761             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4762                 z.high &= ~ lastBitMask;
4763             }
4764         }
4765         else if ( roundingMode != float_round_to_zero ) {
4766             if (   extractFloat128Sign( z )
4767                  ^ ( roundingMode == float_round_up ) ) {
4768                 z.high |= ( a.low != 0 );
4769                 z.high += roundBitsMask;
4770             }
4771         }
4772         z.high &= ~ roundBitsMask;
4773     }
4774     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4775         STATUS(float_exception_flags) |= float_flag_inexact;
4776     }
4777     return z;
4778
4779 }
4780
4781 /*----------------------------------------------------------------------------
4782 | Returns the result of adding the absolute values of the quadruple-precision
4783 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4784 | before being returned.  `zSign' is ignored if the result is a NaN.
4785 | The addition is performed according to the IEC/IEEE Standard for Binary
4786 | Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4788
4789 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4790 {
4791     int32 aExp, bExp, zExp;
4792     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4793     int32 expDiff;
4794
4795     aSig1 = extractFloat128Frac1( a );
4796     aSig0 = extractFloat128Frac0( a );
4797     aExp = extractFloat128Exp( a );
4798     bSig1 = extractFloat128Frac1( b );
4799     bSig0 = extractFloat128Frac0( b );
4800     bExp = extractFloat128Exp( b );
4801     expDiff = aExp - bExp;
4802     if ( 0 < expDiff ) {
4803         if ( aExp == 0x7FFF ) {
4804             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4805             return a;
4806         }
4807         if ( bExp == 0 ) {
4808             --expDiff;
4809         }
4810         else {
4811             bSig0 |= LIT64( 0x0001000000000000 );
4812         }
4813         shift128ExtraRightJamming(
4814             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4815         zExp = aExp;
4816     }
4817     else if ( expDiff < 0 ) {
4818         if ( bExp == 0x7FFF ) {
4819             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4820             return packFloat128( zSign, 0x7FFF, 0, 0 );
4821         }
4822         if ( aExp == 0 ) {
4823             ++expDiff;
4824         }
4825         else {
4826             aSig0 |= LIT64( 0x0001000000000000 );
4827         }
4828         shift128ExtraRightJamming(
4829             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4830         zExp = bExp;
4831     }
4832     else {
4833         if ( aExp == 0x7FFF ) {
4834             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4835                 return propagateFloat128NaN( a, b STATUS_VAR );
4836             }
4837             return a;
4838         }
4839         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4840         if ( aExp == 0 ) {
4841             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4842             return packFloat128( zSign, 0, zSig0, zSig1 );
4843         }
4844         zSig2 = 0;
4845         zSig0 |= LIT64( 0x0002000000000000 );
4846         zExp = aExp;
4847         goto shiftRight1;
4848     }
4849     aSig0 |= LIT64( 0x0001000000000000 );
4850     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4851     --zExp;
4852     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4853     ++zExp;
4854  shiftRight1:
4855     shift128ExtraRightJamming(
4856         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4857  roundAndPack:
4858     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4859
4860 }
4861
4862 /*----------------------------------------------------------------------------
4863 | Returns the result of subtracting the absolute values of the quadruple-
4864 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4865 | difference is negated before being returned.  `zSign' is ignored if the
4866 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4867 | Standard for Binary Floating-Point Arithmetic.
4868 *----------------------------------------------------------------------------*/
4869
4870 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4871 {
4872     int32 aExp, bExp, zExp;
4873     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4874     int32 expDiff;
4875     float128 z;
4876
4877     aSig1 = extractFloat128Frac1( a );
4878     aSig0 = extractFloat128Frac0( a );
4879     aExp = extractFloat128Exp( a );
4880     bSig1 = extractFloat128Frac1( b );
4881     bSig0 = extractFloat128Frac0( b );
4882     bExp = extractFloat128Exp( b );
4883     expDiff = aExp - bExp;
4884     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4885     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4886     if ( 0 < expDiff ) goto aExpBigger;
4887     if ( expDiff < 0 ) goto bExpBigger;
4888     if ( aExp == 0x7FFF ) {
4889         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4890             return propagateFloat128NaN( a, b STATUS_VAR );
4891         }
4892         float_raise( float_flag_invalid STATUS_VAR);
4893         z.low = float128_default_nan_low;
4894         z.high = float128_default_nan_high;
4895         return z;
4896     }
4897     if ( aExp == 0 ) {
4898         aExp = 1;
4899         bExp = 1;
4900     }
4901     if ( bSig0 < aSig0 ) goto aBigger;
4902     if ( aSig0 < bSig0 ) goto bBigger;
4903     if ( bSig1 < aSig1 ) goto aBigger;
4904     if ( aSig1 < bSig1 ) goto bBigger;
4905     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4906  bExpBigger:
4907     if ( bExp == 0x7FFF ) {
4908         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4909         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4910     }
4911     if ( aExp == 0 ) {
4912         ++expDiff;
4913     }
4914     else {
4915         aSig0 |= LIT64( 0x4000000000000000 );
4916     }
4917     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4918     bSig0 |= LIT64( 0x4000000000000000 );
4919  bBigger:
4920     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4921     zExp = bExp;
4922     zSign ^= 1;
4923     goto normalizeRoundAndPack;
4924  aExpBigger:
4925     if ( aExp == 0x7FFF ) {
4926         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4927         return a;
4928     }
4929     if ( bExp == 0 ) {
4930         --expDiff;
4931     }
4932     else {
4933         bSig0 |= LIT64( 0x4000000000000000 );
4934     }
4935     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4936     aSig0 |= LIT64( 0x4000000000000000 );
4937  aBigger:
4938     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4939     zExp = aExp;
4940  normalizeRoundAndPack:
4941     --zExp;
4942     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4943
4944 }
4945
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of adding the quadruple-precision floating-point values
4948 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4949 | for Binary Floating-Point Arithmetic.
4950 *----------------------------------------------------------------------------*/
4951
4952 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4953 {
4954     flag aSign, bSign;
4955
4956     aSign = extractFloat128Sign( a );
4957     bSign = extractFloat128Sign( b );
4958     if ( aSign == bSign ) {
4959         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4960     }
4961     else {
4962         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4963     }
4964
4965 }
4966
4967 /*----------------------------------------------------------------------------
4968 | Returns the result of subtracting the quadruple-precision floating-point
4969 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4970 | Standard for Binary Floating-Point Arithmetic.
4971 *----------------------------------------------------------------------------*/
4972
4973 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4974 {
4975     flag aSign, bSign;
4976
4977     aSign = extractFloat128Sign( a );
4978     bSign = extractFloat128Sign( b );
4979     if ( aSign == bSign ) {
4980         return subFloat128Sigs( a, b, aSign STATUS_VAR );
4981     }
4982     else {
4983         return addFloat128Sigs( a, b, aSign STATUS_VAR );
4984     }
4985
4986 }
4987
4988 /*----------------------------------------------------------------------------
4989 | Returns the result of multiplying the quadruple-precision floating-point
4990 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4991 | Standard for Binary Floating-Point Arithmetic.
4992 *----------------------------------------------------------------------------*/
4993
4994 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4995 {
4996     flag aSign, bSign, zSign;
4997     int32 aExp, bExp, zExp;
4998     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4999     float128 z;
5000
5001     aSig1 = extractFloat128Frac1( a );
5002     aSig0 = extractFloat128Frac0( a );
5003     aExp = extractFloat128Exp( a );
5004     aSign = extractFloat128Sign( a );
5005     bSig1 = extractFloat128Frac1( b );
5006     bSig0 = extractFloat128Frac0( b );
5007     bExp = extractFloat128Exp( b );
5008     bSign = extractFloat128Sign( b );
5009     zSign = aSign ^ bSign;
5010     if ( aExp == 0x7FFF ) {
5011         if (    ( aSig0 | aSig1 )
5012              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5013             return propagateFloat128NaN( a, b STATUS_VAR );
5014         }
5015         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5016         return packFloat128( zSign, 0x7FFF, 0, 0 );
5017     }
5018     if ( bExp == 0x7FFF ) {
5019         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5020         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5021  invalid:
5022             float_raise( float_flag_invalid STATUS_VAR);
5023             z.low = float128_default_nan_low;
5024             z.high = float128_default_nan_high;
5025             return z;
5026         }
5027         return packFloat128( zSign, 0x7FFF, 0, 0 );
5028     }
5029     if ( aExp == 0 ) {
5030         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5031         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5032     }
5033     if ( bExp == 0 ) {
5034         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5035         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5036     }
5037     zExp = aExp + bExp - 0x4000;
5038     aSig0 |= LIT64( 0x0001000000000000 );
5039     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5040     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5041     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5042     zSig2 |= ( zSig3 != 0 );
5043     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5044         shift128ExtraRightJamming(
5045             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5046         ++zExp;
5047     }
5048     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5049
5050 }
5051
5052 /*----------------------------------------------------------------------------
5053 | Returns the result of dividing the quadruple-precision floating-point value
5054 | `a' by the corresponding value `b'.  The operation is performed according to
5055 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5056 *----------------------------------------------------------------------------*/
5057
5058 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5059 {
5060     flag aSign, bSign, zSign;
5061     int32 aExp, bExp, zExp;
5062     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5063     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5064     float128 z;
5065
5066     aSig1 = extractFloat128Frac1( a );
5067     aSig0 = extractFloat128Frac0( a );
5068     aExp = extractFloat128Exp( a );
5069     aSign = extractFloat128Sign( a );
5070     bSig1 = extractFloat128Frac1( b );
5071     bSig0 = extractFloat128Frac0( b );
5072     bExp = extractFloat128Exp( b );
5073     bSign = extractFloat128Sign( b );
5074     zSign = aSign ^ bSign;
5075     if ( aExp == 0x7FFF ) {
5076         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5077         if ( bExp == 0x7FFF ) {
5078             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5079             goto invalid;
5080         }
5081         return packFloat128( zSign, 0x7FFF, 0, 0 );
5082     }
5083     if ( bExp == 0x7FFF ) {
5084         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5085         return packFloat128( zSign, 0, 0, 0 );
5086     }
5087     if ( bExp == 0 ) {
5088         if ( ( bSig0 | bSig1 ) == 0 ) {
5089             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5090  invalid:
5091                 float_raise( float_flag_invalid STATUS_VAR);
5092                 z.low = float128_default_nan_low;
5093                 z.high = float128_default_nan_high;
5094                 return z;
5095             }
5096             float_raise( float_flag_divbyzero STATUS_VAR);
5097             return packFloat128( zSign, 0x7FFF, 0, 0 );
5098         }
5099         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5100     }
5101     if ( aExp == 0 ) {
5102         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5103         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5104     }
5105     zExp = aExp - bExp + 0x3FFD;
5106     shortShift128Left(
5107         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5108     shortShift128Left(
5109         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5110     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5111         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5112         ++zExp;
5113     }
5114     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5115     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5116     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5117     while ( (sbits64) rem0 < 0 ) {
5118         --zSig0;
5119         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5120     }
5121     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5122     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5123         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5124         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5125         while ( (sbits64) rem1 < 0 ) {
5126             --zSig1;
5127             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5128         }
5129         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5130     }
5131     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5132     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5133
5134 }
5135
5136 /*----------------------------------------------------------------------------
5137 | Returns the remainder of the quadruple-precision floating-point value `a'
5138 | with respect to the corresponding value `b'.  The operation is performed
5139 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5140 *----------------------------------------------------------------------------*/
5141
5142 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5143 {
5144     flag aSign, zSign;
5145     int32 aExp, bExp, expDiff;
5146     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5147     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5148     sbits64 sigMean0;
5149     float128 z;
5150
5151     aSig1 = extractFloat128Frac1( a );
5152     aSig0 = extractFloat128Frac0( a );
5153     aExp = extractFloat128Exp( a );
5154     aSign = extractFloat128Sign( a );
5155     bSig1 = extractFloat128Frac1( b );
5156     bSig0 = extractFloat128Frac0( b );
5157     bExp = extractFloat128Exp( b );
5158     if ( aExp == 0x7FFF ) {
5159         if (    ( aSig0 | aSig1 )
5160              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5161             return propagateFloat128NaN( a, b STATUS_VAR );
5162         }
5163         goto invalid;
5164     }
5165     if ( bExp == 0x7FFF ) {
5166         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5167         return a;
5168     }
5169     if ( bExp == 0 ) {
5170         if ( ( bSig0 | bSig1 ) == 0 ) {
5171  invalid:
5172             float_raise( float_flag_invalid STATUS_VAR);
5173             z.low = float128_default_nan_low;
5174             z.high = float128_default_nan_high;
5175             return z;
5176         }
5177         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5178     }
5179     if ( aExp == 0 ) {
5180         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5181         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5182     }
5183     expDiff = aExp - bExp;
5184     if ( expDiff < -1 ) return a;
5185     shortShift128Left(
5186         aSig0 | LIT64( 0x0001000000000000 ),
5187         aSig1,
5188         15 - ( expDiff < 0 ),
5189         &aSig0,
5190         &aSig1
5191     );
5192     shortShift128Left(
5193         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5194     q = le128( bSig0, bSig1, aSig0, aSig1 );
5195     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5196     expDiff -= 64;
5197     while ( 0 < expDiff ) {
5198         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5199         q = ( 4 < q ) ? q - 4 : 0;
5200         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5201         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5202         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5203         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5204         expDiff -= 61;
5205     }
5206     if ( -64 < expDiff ) {
5207         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5208         q = ( 4 < q ) ? q - 4 : 0;
5209         q >>= - expDiff;
5210         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5211         expDiff += 52;
5212         if ( expDiff < 0 ) {
5213             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5214         }
5215         else {
5216             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5217         }
5218         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5219         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5220     }
5221     else {
5222         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5223         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5224     }
5225     do {
5226         alternateASig0 = aSig0;
5227         alternateASig1 = aSig1;
5228         ++q;
5229         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5230     } while ( 0 <= (sbits64) aSig0 );
5231     add128(
5232         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5233     if (    ( sigMean0 < 0 )
5234          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5235         aSig0 = alternateASig0;
5236         aSig1 = alternateASig1;
5237     }
5238     zSign = ( (sbits64) aSig0 < 0 );
5239     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5240     return
5241         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5242
5243 }
5244
5245 /*----------------------------------------------------------------------------
5246 | Returns the square root of the quadruple-precision floating-point value `a'.
5247 | The operation is performed according to the IEC/IEEE Standard for Binary
5248 | Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5250
5251 float128 float128_sqrt( float128 a STATUS_PARAM )
5252 {
5253     flag aSign;
5254     int32 aExp, zExp;
5255     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5256     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5257     float128 z;
5258
5259     aSig1 = extractFloat128Frac1( a );
5260     aSig0 = extractFloat128Frac0( a );
5261     aExp = extractFloat128Exp( a );
5262     aSign = extractFloat128Sign( a );
5263     if ( aExp == 0x7FFF ) {
5264         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5265         if ( ! aSign ) return a;
5266         goto invalid;
5267     }
5268     if ( aSign ) {
5269         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5270  invalid:
5271         float_raise( float_flag_invalid STATUS_VAR);
5272         z.low = float128_default_nan_low;
5273         z.high = float128_default_nan_high;
5274         return z;
5275     }
5276     if ( aExp == 0 ) {
5277         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5278         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5279     }
5280     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5281     aSig0 |= LIT64( 0x0001000000000000 );
5282     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5283     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5284     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5285     doubleZSig0 = zSig0<<1;
5286     mul64To128( zSig0, zSig0, &term0, &term1 );
5287     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5288     while ( (sbits64) rem0 < 0 ) {
5289         --zSig0;
5290         doubleZSig0 -= 2;
5291         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5292     }
5293     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5294     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5295         if ( zSig1 == 0 ) zSig1 = 1;
5296         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5297         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5298         mul64To128( zSig1, zSig1, &term2, &term3 );
5299         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5300         while ( (sbits64) rem1 < 0 ) {
5301             --zSig1;
5302             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5303             term3 |= 1;
5304             term2 |= doubleZSig0;
5305             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5306         }
5307         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5308     }
5309     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5310     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5311
5312 }
5313
5314 /*----------------------------------------------------------------------------
5315 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5316 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5317 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5318 *----------------------------------------------------------------------------*/
5319
5320 int float128_eq( float128 a, float128 b STATUS_PARAM )
5321 {
5322
5323     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5324               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5325          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5326               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5327        ) {
5328         if (    float128_is_signaling_nan( a )
5329              || float128_is_signaling_nan( b ) ) {
5330             float_raise( float_flag_invalid STATUS_VAR);
5331         }
5332         return 0;
5333     }
5334     return
5335            ( a.low == b.low )
5336         && (    ( a.high == b.high )
5337              || (    ( a.low == 0 )
5338                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5339            );
5340
5341 }
5342
5343 /*----------------------------------------------------------------------------
5344 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5345 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5347 | Arithmetic.
5348 *----------------------------------------------------------------------------*/
5349
5350 int float128_le( float128 a, float128 b STATUS_PARAM )
5351 {
5352     flag aSign, bSign;
5353
5354     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5355               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5356          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5357               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5358        ) {
5359         float_raise( float_flag_invalid STATUS_VAR);
5360         return 0;
5361     }
5362     aSign = extractFloat128Sign( a );
5363     bSign = extractFloat128Sign( b );
5364     if ( aSign != bSign ) {
5365         return
5366                aSign
5367             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5368                  == 0 );
5369     }
5370     return
5371           aSign ? le128( b.high, b.low, a.high, a.low )
5372         : le128( a.high, a.low, b.high, b.low );
5373
5374 }
5375
5376 /*----------------------------------------------------------------------------
5377 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5378 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5379 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5380 *----------------------------------------------------------------------------*/
5381
5382 int float128_lt( float128 a, float128 b STATUS_PARAM )
5383 {
5384     flag aSign, bSign;
5385
5386     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5387               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5388          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5389               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5390        ) {
5391         float_raise( float_flag_invalid STATUS_VAR);
5392         return 0;
5393     }
5394     aSign = extractFloat128Sign( a );
5395     bSign = extractFloat128Sign( b );
5396     if ( aSign != bSign ) {
5397         return
5398                aSign
5399             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5400                  != 0 );
5401     }
5402     return
5403           aSign ? lt128( b.high, b.low, a.high, a.low )
5404         : lt128( a.high, a.low, b.high, b.low );
5405
5406 }
5407
5408 /*----------------------------------------------------------------------------
5409 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5410 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5411 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5413 *----------------------------------------------------------------------------*/
5414
5415 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5416 {
5417
5418     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5419               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5420          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5421               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5422        ) {
5423         float_raise( float_flag_invalid STATUS_VAR);
5424         return 0;
5425     }
5426     return
5427            ( a.low == b.low )
5428         && (    ( a.high == b.high )
5429              || (    ( a.low == 0 )
5430                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5431            );
5432
5433 }
5434
5435 /*----------------------------------------------------------------------------
5436 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5437 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5438 | cause an exception.  Otherwise, the comparison is performed according to the
5439 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5440 *----------------------------------------------------------------------------*/
5441
5442 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5443 {
5444     flag aSign, bSign;
5445
5446     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5447               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5448          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5449               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5450        ) {
5451         if (    float128_is_signaling_nan( a )
5452              || float128_is_signaling_nan( b ) ) {
5453             float_raise( float_flag_invalid STATUS_VAR);
5454         }
5455         return 0;
5456     }
5457     aSign = extractFloat128Sign( a );
5458     bSign = extractFloat128Sign( b );
5459     if ( aSign != bSign ) {
5460         return
5461                aSign
5462             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5463                  == 0 );
5464     }
5465     return
5466           aSign ? le128( b.high, b.low, a.high, a.low )
5467         : le128( a.high, a.low, b.high, b.low );
5468
5469 }
5470
5471 /*----------------------------------------------------------------------------
5472 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5473 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5474 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5475 | Standard for Binary Floating-Point Arithmetic.
5476 *----------------------------------------------------------------------------*/
5477
5478 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5479 {
5480     flag aSign, bSign;
5481
5482     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5483               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5484          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5485               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5486        ) {
5487         if (    float128_is_signaling_nan( a )
5488              || float128_is_signaling_nan( b ) ) {
5489             float_raise( float_flag_invalid STATUS_VAR);
5490         }
5491         return 0;
5492     }
5493     aSign = extractFloat128Sign( a );
5494     bSign = extractFloat128Sign( b );
5495     if ( aSign != bSign ) {
5496         return
5497                aSign
5498             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5499                  != 0 );
5500     }
5501     return
5502           aSign ? lt128( b.high, b.low, a.high, a.low )
5503         : lt128( a.high, a.low, b.high, b.low );
5504
5505 }
5506
5507 #endif
5508
5509 /* misc functions */
5510 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5511 {
5512     return int64_to_float32(a STATUS_VAR);
5513 }
5514
5515 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5516 {
5517     return int64_to_float64(a STATUS_VAR);
5518 }
5519
5520 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5521 {
5522     int64_t v;
5523     unsigned int res;
5524
5525     v = float32_to_int64(a STATUS_VAR);
5526     if (v < 0) {
5527         res = 0;
5528         float_raise( float_flag_invalid STATUS_VAR);
5529     } else if (v > 0xffffffff) {
5530         res = 0xffffffff;
5531         float_raise( float_flag_invalid STATUS_VAR);
5532     } else {
5533         res = v;
5534     }
5535     return res;
5536 }
5537
5538 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5539 {
5540     int64_t v;
5541     unsigned int res;
5542
5543     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5544     if (v < 0) {
5545         res = 0;
5546         float_raise( float_flag_invalid STATUS_VAR);
5547     } else if (v > 0xffffffff) {
5548         res = 0xffffffff;
5549         float_raise( float_flag_invalid STATUS_VAR);
5550     } else {
5551         res = v;
5552     }
5553     return res;
5554 }
5555
5556 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5557 {
5558     int64_t v;
5559     unsigned int res;
5560
5561     v = float64_to_int64(a STATUS_VAR);
5562     if (v < 0) {
5563         res = 0;
5564         float_raise( float_flag_invalid STATUS_VAR);
5565     } else if (v > 0xffffffff) {
5566         res = 0xffffffff;
5567         float_raise( float_flag_invalid STATUS_VAR);
5568     } else {
5569         res = v;
5570     }
5571     return res;
5572 }
5573
5574 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5575 {
5576     int64_t v;
5577     unsigned int res;
5578
5579     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5580     if (v < 0) {
5581         res = 0;
5582         float_raise( float_flag_invalid STATUS_VAR);
5583     } else if (v > 0xffffffff) {
5584         res = 0xffffffff;
5585         float_raise( float_flag_invalid STATUS_VAR);
5586     } else {
5587         res = v;
5588     }
5589     return res;
5590 }
5591
5592 /* FIXME: This looks broken.  */
5593 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5594 {
5595     int64_t v;
5596
5597     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5598     v += float64_val(a);
5599     v = float64_to_int64(make_float64(v) STATUS_VAR);
5600
5601     return v - INT64_MIN;
5602 }
5603
5604 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5605 {
5606     int64_t v;
5607
5608     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5609     v += float64_val(a);
5610     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5611
5612     return v - INT64_MIN;
5613 }
5614
5615 #define COMPARE(s, nan_exp)                                                  \
5616 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5617                                       int is_quiet STATUS_PARAM )            \
5618 {                                                                            \
5619     flag aSign, bSign;                                                       \
5620     bits ## s av, bv;                                                        \
5621                                                                              \
5622     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5623          extractFloat ## s ## Frac( a ) ) ||                                 \
5624         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5625           extractFloat ## s ## Frac( b ) )) {                                \
5626         if (!is_quiet ||                                                     \
5627             float ## s ## _is_signaling_nan( a ) ||                          \
5628             float ## s ## _is_signaling_nan( b ) ) {                         \
5629             float_raise( float_flag_invalid STATUS_VAR);                     \
5630         }                                                                    \
5631         return float_relation_unordered;                                     \
5632     }                                                                        \
5633     aSign = extractFloat ## s ## Sign( a );                                  \
5634     bSign = extractFloat ## s ## Sign( b );                                  \
5635     av = float ## s ## _val(a);                                              \
5636     bv = float ## s ## _val(b);                                              \
5637     if ( aSign != bSign ) {                                                  \
5638         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5639             /* zero case */                                                  \
5640             return float_relation_equal;                                     \
5641         } else {                                                             \
5642             return 1 - (2 * aSign);                                          \
5643         }                                                                    \
5644     } else {                                                                 \
5645         if (av == bv) {                                                      \
5646             return float_relation_equal;                                     \
5647         } else {                                                             \
5648             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5649         }                                                                    \
5650     }                                                                        \
5651 }                                                                            \
5652                                                                              \
5653 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5654 {                                                                            \
5655     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5656 }                                                                            \
5657                                                                              \
5658 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5659 {                                                                            \
5660     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5661 }
5662
5663 COMPARE(32, 0xff)
5664 COMPARE(64, 0x7ff)
5665
5666 INLINE int float128_compare_internal( float128 a, float128 b,
5667                                       int is_quiet STATUS_PARAM )
5668 {
5669     flag aSign, bSign;
5670
5671     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5672           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5673         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5674           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5675         if (!is_quiet ||
5676             float128_is_signaling_nan( a ) ||
5677             float128_is_signaling_nan( b ) ) {
5678             float_raise( float_flag_invalid STATUS_VAR);
5679         }
5680         return float_relation_unordered;
5681     }
5682     aSign = extractFloat128Sign( a );
5683     bSign = extractFloat128Sign( b );
5684     if ( aSign != bSign ) {
5685         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5686             /* zero case */
5687             return float_relation_equal;
5688         } else {
5689             return 1 - (2 * aSign);
5690         }
5691     } else {
5692         if (a.low == b.low && a.high == b.high) {
5693             return float_relation_equal;
5694         } else {
5695             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5696         }
5697     }
5698 }
5699
5700 int float128_compare( float128 a, float128 b STATUS_PARAM )
5701 {
5702     return float128_compare_internal(a, b, 0 STATUS_VAR);
5703 }
5704
5705 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5706 {
5707     return float128_compare_internal(a, b, 1 STATUS_VAR);
5708 }
5709
5710 /* Multiply A by 2 raised to the power N.  */
5711 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5712 {
5713     flag aSign;
5714     int16 aExp;
5715     bits32 aSig;
5716
5717     aSig = extractFloat32Frac( a );
5718     aExp = extractFloat32Exp( a );
5719     aSign = extractFloat32Sign( a );
5720
5721     if ( aExp == 0xFF ) {
5722         return a;
5723     }
5724     if ( aExp != 0 )
5725         aSig |= 0x00800000;
5726     else if ( aSig == 0 )
5727         return a;
5728
5729     aExp += n - 1;
5730     aSig <<= 7;
5731     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5732 }
5733
5734 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5735 {
5736     flag aSign;
5737     int16 aExp;
5738     bits64 aSig;
5739
5740     aSig = extractFloat64Frac( a );
5741     aExp = extractFloat64Exp( a );
5742     aSign = extractFloat64Sign( a );
5743
5744     if ( aExp == 0x7FF ) {
5745         return a;
5746     }
5747     if ( aExp != 0 )
5748         aSig |= LIT64( 0x0010000000000000 );
5749     else if ( aSig == 0 )
5750         return a;
5751
5752     aExp += n - 1;
5753     aSig <<= 10;
5754     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5755 }
5756
5757 #ifdef FLOATX80
5758 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5759 {
5760     flag aSign;
5761     int16 aExp;
5762     bits64 aSig;
5763
5764     aSig = extractFloatx80Frac( a );
5765     aExp = extractFloatx80Exp( a );
5766     aSign = extractFloatx80Sign( a );
5767
5768     if ( aExp == 0x7FF ) {
5769         return a;
5770     }
5771     if (aExp == 0 && aSig == 0)
5772         return a;
5773
5774     aExp += n;
5775     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5776                                           aSign, aExp, aSig, 0 STATUS_VAR );
5777 }
5778 #endif
5779
5780 #ifdef FLOAT128
5781 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5782 {
5783     flag aSign;
5784     int32 aExp;
5785     bits64 aSig0, aSig1;
5786
5787     aSig1 = extractFloat128Frac1( a );
5788     aSig0 = extractFloat128Frac0( a );
5789     aExp = extractFloat128Exp( a );
5790     aSign = extractFloat128Sign( a );
5791     if ( aExp == 0x7FFF ) {
5792         return a;
5793     }
5794     if ( aExp != 0 )
5795         aSig0 |= LIT64( 0x0001000000000000 );
5796     else if ( aSig0 == 0 && aSig1 == 0 )
5797         return a;
5798
5799     aExp += n - 1;
5800     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5801                                           STATUS_VAR );
5802
5803 }
5804 #endif