//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// Double Precision Multiply
// Operand registers: A and B are the two 64-bit inputs, and A also receives
// the result.  The H/L names are the high and low 32-bit words of each pair.
#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2

// Scratch pair: first holds B's left-aligned mantissa, later range limits,
// shift counts and replacement values.
#define BTMP r5:4
#define BTMPH r5
#define BTMPL r4

// Accumulator for the two cross-term partial products.  r7:6 is re-aliased
// to EXP10/EXP1/EXP0 inside the routine once the cross terms are consumed.
#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6

// The routine loads this pair with the 64-bit constant 1, so the low word
// (S_ONE) reads as 1 and the high word (S_ZERO) reads as 0.
#define ONE r9:8
#define S_ONE r8
#define S_ZERO r9

// High x high partial product; becomes the value handed to convert_d2df.
#define PP_HH r11:10
#define PP_HH_H r11
#define PP_HH_L r10

// Scratch pair: first holds A's left-aligned mantissa, later abs(PP_HH) or
// the max-finite / infinity replacement value.
#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

// Low x low partial product; afterwards reused as general 64-bit scratch.
#define PP_LL r15:14
#define PP_LL_H r15
#define PP_LL_L r14

// 32-bit scratch.
#define TMP r28

// Field sizes used by the insert/extract operations below: total mantissa
// bits, mantissa bits held in the high word, and exponent bits.
#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
// NOTE(review): the IEEE-754 binary64 bias is 1023; this routine does all of
// its exponent arithmetic with 1024 (see the #-BIAS-58 adjustments).  Confirm
// how the difference is absorbed before changing this value.
#define BIAS 1024
// Not referenced in the part of this file visible here.
#define MANTISSA_TO_INT_BIAS 52

// Some constant to adjust normalization amount in error code
// Amount to right shift the partial product to get to a denorm
#define FUDGE 5

// Each alias macro exports one extra global symbol that resolves to
// __hexagon_<TAG>.
#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
// Records the symbol size as the distance from TAG to the current location.
#define END(TAG) .size TAG,.-TAG

// Bit offset of the 2-bit rounding-mode field inside USR.
#define SR_ROUND_OFF 22
// __hexagon_muldf3 -- double-precision multiply.
//   In:   A (r1:0) and B (r3:2) hold the two operands.
//   Out:  A (r1:0) holds the product; control returns through r31.
//   The alias macros additionally export the same entry point as
//   __qdsp_muldf3, __hexagon_fast_muldf3 and __hexagon_fast2_muldf3.
//
// Fast path: both operands are normal and the result exponent is in range.
// The two mantissas are left-aligned with their leading one at bit 62,
// multiplied as four 32x32 partial products, and the top 64 bits (plus a
// sticky bit for anything discarded) are converted back with convert_d2df.
	.text
	.global __hexagon_muldf3
	.type __hexagon_muldf3,@function
	Q6_ALIAS(muldf3)
	FAST_ALIAS(muldf3)
	FAST2_ALIAS(muldf3)
	.p2align 5
__hexagon_muldf3:
	{
		// Both compares write p0 in the same packet, so p0 ends up true only
		// when A and B both match class mask #2 (normal numbers).
		p0 = dfclass(A,#2)
		p0 = dfclass(B,#2)
		// Preload ATMP with only bit 62 set: the implicit leading one.
		ATMP = combine(##0x40000000,#0)
	}
	{
		// Drop A's MANTBITS mantissa bits in at bit EXPBITS-1, right below
		// the leading one that is already in ATMP.
		ATMP = insert(A,#MANTBITS,#EXPBITS-1)
		// Same alignment for B; its top two bits are fixed up next packet.
		BTMP = asl(B,#EXPBITS-1)
		TMP = #-BIAS
		ONE = #1
	}
	{
		// Uses only the low word of BTMP, which the insert below leaves alone.
		PP_ODD = mpyu(BTMPL,ATMPH)
		// Force bits 63:62 of BTMP to 01: sign cleared, leading one set.
		BTMP = insert(ONE,#2,#62)
	}
	// since we know that the MSB of the H registers is zero, we should never carry
	// H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1
	// Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
	// Therefore, we can add 3 2^32-1 values safely without carry.  We only need one.
	{
		PP_LL = mpyu(ATMPL,BTMPL)
		PP_ODD += mpyu(ATMPL,BTMPH)
	}
	{
		// Fold the upper half of the low product into the cross terms.
		PP_ODD += lsr(PP_LL,#32)
		PP_HH = mpyu(ATMPH,BTMPH)
		// BTMPH = 2*BIAS-4 and BTMPL = 0: the exponent limits for the
		// range check further down.
		BTMP = combine(##BIAS+BIAS-4,#0)
	}
	{
		PP_HH += lsr(PP_ODD,#32)
		if (!p0) jump .Lmul_abnormal
		// p1 = both discarded low words are zero (the two compares are ANDed).
		p1 = cmp.eq(PP_LL_L,#0)		// 64 lsb's 0?
		p1 = cmp.eq(PP_ODD_L,#0)	// 64 lsb's 0?
	}

	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so

	// The cross-term accumulator is dead from here on; reuse r7:6 for the
	// exponent work.
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#define EXP10 r7:6
#define EXP1 r7
#define EXP0 r6
	{
		// Any nonzero discarded bit becomes a sticky LSB in the product.
		if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
		// Pull the raw exponent fields out of the two operands.
		EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
		EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
	}
	{
		// Prepare the negated product in case the result is negative.
		PP_LL = neg(PP_HH)
		// EXP0 = exp(A) + exp(B) - BIAS   (TMP still holds -BIAS here)
		EXP0 += add(TMP,EXP1)
		// Bit 31 of TMP = sign of the result.
		TMP = xor(AH,BH)
	}
	{
		// p2 = result is non-negative; otherwise switch to the negated product.
		if (!p2.new) PP_HH = PP_LL
		p2 = cmp.gt(TMP,#-1)
		// p0 = (BTMPL < EXP0 <= BTMPH), i.e. 0 < EXP0 <= 2*BIAS-4.
		p0 = !cmp.gt(EXP0,BTMPH)
		p0 = cmp.gt(EXP0,BTMPL)
		if (!p0.new) jump:nt .Lmul_ovf_unf
	}
	{
		// Convert the signed 64-bit product to double, then rebias the
		// exponent that the conversion produced.
		A = convert_d2df(PP_HH)
		EXP0 = add(EXP0,#-BIAS-58)
	}
	{
		// Add the adjustment directly into the exponent field of the result.
		AH += asl(EXP0,#HI_MANTBITS)
		jumpr r31
	}
// .Lpossible_unf -- result exponent is positive, but rounding may have
// produced the smallest normal value from something that was really tiny.
// Entered from .Lmul_ovf_unf with the finished result already in A.
	.falign
.Lpossible_unf:
	// We end up with a positive exponent
	// But we may have rounded up to an exponent of 1.
	// If the exponent is 1, if we rounded up to it
	// we need to also raise underflow
	// Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
	// And the PP should also have more than one bit set
	//
	// Note: ATMP should have abs(PP_HH)
	// Note: BTMPL should have 0x7FEFFFFF
	{
		// p0 = (AL == 0) && ((AH & BTMPL) == 0); the two tests are ANDed.
		// Anything else is an ordinary result: return it unchanged.
		p0 = cmp.eq(AL,#0)
		p0 = bitsclr(AH,BTMPL)
		if (!p0.new) jumpr:t r31
		BTMPH = #0x7fff
	}
	{
		// p0 = every bit of the 0x7fff mask (BTMPH, read before it is
		// overwritten below) is set in ATMPH.
		p0 = bitsset(ATMPH,BTMPH)
		BTMPL = USR
		// 0x30 is the inexact+underflow flag pair used throughout this file.
		BTMPH = #0x030
	}
	{
		if (p0) BTMPL = or(BTMPL,BTMPH)
	}
	{
		USR = BTMPL
	}
	{
		// FP compare issued only so that a newly set flag can raise its
		// exception; the predicate result is not used.
		p0 = dfcmp.eq(A,A)
		jumpr r31
	}
// .Lmul_ovf_unf -- the fast-path range check on EXP0 failed.  Recompute the
// result the normal way, look at the exponent it would really get, and
// dispatch to overflow, possible-underflow, or the underflow code below.
	.falign
.Lmul_ovf_unf:
	{
		A = convert_d2df(PP_HH)
		ATMP = abs(PP_HH)			// take absolute value
		EXP1 = add(EXP0,#-BIAS-58)
	}
	{
		AH += asl(EXP1,#HI_MANTBITS)
		// Reads AH as it was before the += above, i.e. the exponent field
		// produced by convert_d2df.
		EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
		BTMPL = ##0x7FEFFFFF
	}
	{
		// EXP1 = converted exponent + EXP0 - BIAS - 58: the exponent the
		// result would have if the field were wide enough.
		EXP1 += add(EXP0,##-BIAS-58)
		//BTMPH = add(clb(ATMP),#-2)
		BTMPH = #0
	}
	{
		p0 = cmp.gt(EXP1,##BIAS+BIAS-2)	// overflow
		if (p0.new) jump:nt .Lmul_ovf
	}
	{
		p0 = cmp.gt(EXP1,#0)
		if (p0.new) jump:nt .Lpossible_unf
		// BTMPH is 0 here, so this copies EXP0 into BTMPH.
		BTMPH = sub(EXP0,BTMPH)
		TMP = #63				// max amount to shift
	}
	// Underflow
	//
	// PP_HH has the partial product with sticky LSB.
	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
	// The exponent of PP_HH is in EXP1, which is non-positive (0 or negative)
	// That's the exponent that happens after the normalization
	//
	// EXP0 has the exponent that, when added to the normalized value, is out of range.
	//
	// Strategy:
	//
	// * Shift down bits, with sticky bit, such that the bits are aligned according
	//   to the LZ count and appropriate exponent, but not all the way to mantissa
	//   field, keep around the last few bits.
	// * Put a 1 near the MSB
	// * Check the LSBs for inexact; if inexact also set underflow
	// * Convert [u]d2df -- will correctly round according to rounding mode
	// * Replace exponent field with zero

	{
		BTMPL = #0	 			// offset for extract
		BTMPH = sub(#FUDGE,BTMPH)		// amount to right shift
	}
	{
		p3 = cmp.gt(PP_HH_H,#-1)		// is it positive?
		BTMPH = min(BTMPH,TMP)			// Don't shift more than 63
		// Work on the magnitude; the sign is re-applied through p3 below.
		PP_HH = ATMP
	}
	{
		TMP = USR
		// Width comes from BTMPH and offset from BTMPL (0): grab exactly the
		// low bits that the shift in the next packet throws away.
		PP_LL = extractu(PP_HH,BTMP)
	}
	{
		PP_HH = asr(PP_HH,BTMPH)
		BTMPL = #0x0030					// underflow flag
		// Zero the exponent field of A.
		AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
	}
	{
		p0 = cmp.gtu(ONE,PP_LL)				// Did we extract all zeros?
		if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)	// add sticky bit
		PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)	// Add back in a bit so we can use convert instruction
	}
	{
		PP_LL = neg(PP_HH)
		p1 = bitsclr(PP_HH_L,#0x7)		// Are the LSB's clear?
		if (!p1.new) TMP = or(BTMPL,TMP)	// If not, Inexact+Underflow
	}
	{
		// Restore the sign of the original product.
		if (!p3) PP_HH = PP_LL
		USR = TMP
	}
	{
		A = convert_d2df(PP_HH)			// Do rounding
		p0 = dfcmp.eq(A,A)			// realize exception
	}
	{
		// Clear the upper EXPBITS-1 exponent bits, leaving the lowest
		// exponent bit and the sign untouched.
		AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)		// Insert correct exponent
		jumpr r31
	}
// .Lmul_ovf -- exponent too large.  Sets inexact+overflow in USR and returns
// either the largest finite value or infinity, chosen from the rounding mode
// and the sign of the result.
	.falign
.Lmul_ovf:
	// We get either max finite value or infinity.  Either way, overflow+inexact
	{
		TMP = USR
		ATMP = combine(##0x7fefffff,#-1)	// positive max finite
		// Only the sign bit of this copy is used from here on.
		A = PP_HH
	}
	{
		PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)	// rounding bits
		TMP = or(TMP,#0x28)			// inexact + overflow
		BTMP = combine(##0x7ff00000,#0)		// positive infinity
	}
	{
		USR = TMP
		PP_LL_L ^= lsr(AH,#31)			// Does sign match rounding?
		// Reads PP_LL_L before the ^= above takes effect.
		TMP = PP_LL_L				// unmodified rounding mode
	}
	{
		// The two compares are ANDed into p0; the final dfcmp.eq writes p0
		// as well, but p0 is not consumed after this packet.
		p0 = !cmp.eq(TMP,#1)			// If not round-to-zero and
		p0 = !cmp.eq(PP_LL_L,#2)		// Not rounding the other way,
		if (p0.new) ATMP = BTMP			// we should get infinity
		p0 = dfcmp.eq(A,A)			// Realize FP exception if enabled
	}
	{
		A = insert(ATMP,#63,#0)			// insert inf/maxfinite, leave sign
		jumpr r31
	}
// .Lmul_abnormal -- at least one operand is not a normal number.  Sort the
// operands by magnitude so that A is the larger, then peel off NaN,
// infinity and zero cases.  What remains involves a subnormal: either the
// product is hopelessly small (.Lmul_tiny) or B is renormalized, A's exponent
// is reduced to compensate, and the multiply is restarted from the top.
.Lmul_abnormal:
	{
		ATMP = extractu(A,#63,#0)		// strip off sign
		BTMP = extractu(B,#63,#0)		// strip off sign
	}
	{
		// Both conditional moves read the pre-packet values, so together
		// they swap A and B when |A| <= |B|.
		p3 = cmp.gtu(ATMP,BTMP)
		if (!p3.new) A = B			// sort values
		if (!p3.new) B = A			// sort values
	}
	{
		// Any NaN --> NaN, possibly raise invalid if sNaN
		p0 = dfclass(A,#0x0f)		// A not NaN?
		if (!p0.new) jump:nt .Linvalid_nan
		// Keep the sign-stripped copies in the same order as A and B.
		if (!p3) ATMP = BTMP
		if (!p3) BTMP = ATMP
	}
	{
		// Infinity * nonzero number is infinity
		p1 = dfclass(A,#0x08)		// A is infinity
		p1 = dfclass(B,#0x0e)		// B is nonzero
	}
	{
		// Infinity * zero --> NaN, raise invalid
		// Other zeros return zero
		p0 = dfclass(A,#0x08)		// A is infinity
		p0 = dfclass(B,#0x01)		// B is zero
	}
	{
		if (p1) jump .Ltrue_inf
		// p2 = B is zero (A, the larger operand, is finite if we get past
		// the p0 test in the next packet).
		p2 = dfclass(B,#0x01)
	}
	{
		if (p0) jump .Linvalid_zeroinf
		if (p2) jump .Ltrue_zero		// so return zero
		// Mask covering the five most significant exponent bits of AH.
		TMP = ##0x7c000000
	}
	// We are left with a normal or subnormal times a subnormal. A > B
	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
	// we go to a single sticky bit, which we can round easily.
	// If A and B might multiply to something bigger, decrease A exponent and increase
	// B exponent and try again
	{
		// p0 = (AH & TMP) == 0: the top five exponent bits of A are clear.
		p0 = bitsclr(AH,TMP)
		if (p0.new) jump:nt .Lmul_tiny
	}
	{
		// Leading-zero count of |B|.
		TMP = cl0(BTMP)
	}
	{
		// Shift needed to bring B's leading one up to the lowest exponent bit.
		TMP = add(TMP,#-EXPBITS)
	}
	{
		BTMP = asl(BTMP,TMP)
	}
	{
		// Write the renormalized magnitude back into B, keeping B's sign bit.
		B = insert(BTMP,#63,#0)
		// Take the same number of steps off A's exponent field.
		AH -= asl(TMP,#HI_MANTBITS)
	}
	jump __hexagon_muldf3
// .Lmul_tiny -- the product is too small to leave anything but a sticky bit.
// Sets inexact+underflow in USR and returns a signed zero or a signed
// minimum-magnitude value (low word = 1), chosen from the rounding mode.
.Lmul_tiny:
	{
		TMP = USR
		A = xor(A,B)				// get sign bit
	}
	{
		TMP = or(TMP,#0x30)			// Inexact + Underflow
		A = insert(ONE,#63,#0)			// put in rounded up value
		BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)	// get rounding mode
	}
	{
		USR = TMP
		// The compare sees the raw rounding mode; the ^= below takes effect
		// only after this packet.
		p0 = cmp.gt(BTMPH,#1)			// Round towards pos/neg inf?
		if (!p0.new) AL = #0			// If not, zero
		BTMPH ^= lsr(AH,#31)			// rounding my way --> set LSB
	}
	{
		// Keep the rounded-up value only when (mode ^ sign) == 3; in every
		// other case the low word is cleared, giving a signed zero.
		p0 = cmp.eq(BTMPH,#3)			// if rounding towards right inf
		if (!p0.new) AL = #0			// don't go to zero
		jumpr r31
	}
// .Linvalid_zeroinf -- infinity times zero.  ORs #2 into USR (the "invalid"
// condition per the comments in .Lmul_abnormal) and returns the all-ones
// 64-bit pattern in A.
.Linvalid_zeroinf:
	{
		TMP = USR
	}
	{
		A = #-1
		TMP = or(TMP,#2)
	}
	{
		USR = TMP
	}
	{
		p0 = dfcmp.uo(A,A)			// force exception if enabled
		jumpr r31
	}
// .Linvalid_nan -- A (the larger-magnitude operand) is a NaN.  Both operands
// are pushed through convert_df2sf purely for the side effect noted in the
// comments below; the converted values are discarded.  Returns the all-ones
// 64-bit pattern in A.
.Linvalid_nan:
	{
		p0 = dfclass(B,#0x0f)			// if B is not NaN
		TMP = convert_df2sf(A)			// will generate invalid if sNaN
		if (p0.new) B = A 			// make it whatever A is
	}
	{
		BL = convert_df2sf(B)			// will generate invalid if sNaN
		A = #-1
		jumpr r31
	}
// .Ltrue_zero / .Ltrue_inf -- exact zero and exact infinity results.
// .Ltrue_zero swaps the operands so that the zero ends up in A, then falls
// through.  .Ltrue_inf returns A with its sign bit XORed with B's sign bit.
	.falign
.Ltrue_zero:
	{
		// Both moves read the pre-packet values: this is a swap.
		A = B
		B = A
	}
.Ltrue_inf:
	{
		// Isolate B's sign bit (bit 31 of BH).
		BH = extract(BH,#1,#31)
	}
	{
		// Only bit 0 of BH survives the shift; flip A's sign if B was negative.
		AH ^= asl(BH,#31)
		jumpr r31
	}
END(__hexagon_muldf3)
#undef ATMP
|
|
#undef ATMPL
|
|
#undef ATMPH
|
|
#undef BTMP
|
|
#undef BTMPL
|
|
#undef BTMPH
|