You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

239 lines
7.5 KiB

/*
* Copyright (C) 2016 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
namespace android {
// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
#if USE_SSE
// Two-level stringification: TO_STRING(x) routes x through TO_STRING2 so that a macro
// argument is expanded first and only then turned into a string literal by the # operator.
#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
"." TO_STRING(__GNUC_MINOR__) \
"." TO_STRING(__GNUC_PATCHLEVEL__)) */
//
// SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
// SSE kernel used by the Process<>() / ProcessL<>() specializations in this header.
//
// Every loop pass consumes four coefficients from coefsP and four from coefsN, multiplies
// them with four frames taken around sP and four frames taken around sN, and adds the
// products into per-channel accumulators. After the loop the accumulators are summed
// horizontally, scaled by volumeLR[0] / volumeLR[1] and added into out[0] / out[1].
//
// Template parameters:
//   CHANNELS  1 or 2. With 2, samples are treated as interleaved L/R pairs. With 1, the
//             single sum is used for both out[0] and out[1].
//   STRIDE    not referenced inside this function.
//   FIXED     true: coefficients are used exactly as loaded.
//             false: each coefficient is blended with the matching entry of
//             coefsP1 / coefsN1 using lerpP (see the formulas in the loop).
//
// Parameters:
//   out       two floats that are read, accumulated into, and written back.
//   count     number of coefficients per side; the loop steps by four.
//   coefsP    coefficients for the sP side, read with aligned 16-byte loads.
//   coefsN    coefficients for the sN side, read with aligned 16-byte loads.
//   sP        frame paired with coefsP[0]; later coefficients pair with frames at
//             decreasing addresses. Read with unaligned loads.
//   sN        frame paired with coefsN[0]; later coefficients pair with frames at
//             increasing addresses. Read with unaligned loads.
//   volumeLR  two floats: gain applied to out[0] and out[1] respectively.
//   lerpP     interpolation factor; only read when FIXED is false.
//   coefsP1   second coefficient set for the sP side; only read when FIXED is false
//             (aligned loads).
//   coefsN1   second coefficient set for the sN side; only read when FIXED is false
//             (aligned loads).
template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessSSEIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    // Each pass loads a group of four frames, so start at the lowest-addressed
    // frame of the first group.
    sP -= CHANNELS*(4-1);

    // Broadcast interpolation factor; left unset when it is never used.
    __m128 lerpVec;
    if (!FIXED) {
        lerpVec = _mm_set1_ps(lerpP);
    }

    // Four partial sums per channel; sumR only participates in the stereo case.
    __m128 sumL = _mm_setzero_ps();
    __m128 sumR;
    if (CHANNELS == 2) {
        sumR = _mm_setzero_ps();
    }

    do {
        __m128 cPos = _mm_load_ps(coefsP);
        __m128 cNeg = _mm_load_ps(coefsN);
        coefsP += 4;
        coefsN += 4;

        if (!FIXED) {
            const __m128 cPosNext = _mm_load_ps(coefsP1);
            const __m128 cNegNext = _mm_load_ps(coefsN1);
            coefsP1 += 4;
            coefsN1 += 4;

            // Blend the two coefficient sets:
            //   cPos = lerp * (cPosNext - cPos) + cPos
            //   cNeg = lerp * (cNeg - cNegNext) + cNegNext
            const __m128 deltaPos = _mm_sub_ps(cPosNext, cPos);
            const __m128 deltaNeg = _mm_sub_ps(cNeg, cNegNext);
#if USE_AVX2
            cPos = _mm_fmadd_ps(deltaPos, lerpVec, cPos);
            cNeg = _mm_fmadd_ps(deltaNeg, lerpVec, cNegNext);
#else
            cPos = _mm_add_ps(_mm_mul_ps(deltaPos, lerpVec), cPos);
            cNeg = _mm_add_ps(_mm_mul_ps(deltaNeg, lerpVec), cNegNext);
#endif //USE_AVX2
        }

        if (CHANNELS == 1) {
            const __m128 rawPos = _mm_loadu_ps(sP);
            const __m128 sampNeg = _mm_loadu_ps(sN);
            sP -= 4;
            sN += 4;

            // 0x1B reverses the four lanes so lane 0 holds the highest-addressed sample.
            const __m128 sampPos = _mm_shuffle_ps(rawPos, rawPos, 0x1B);
#if USE_AVX2
            sumL = _mm_fmadd_ps(sampPos, cPos, sumL);
            sumL = _mm_fmadd_ps(sampNeg, cNeg, sumL);
#else
            sumL = _mm_add_ps(sumL, _mm_mul_ps(sampPos, cPos));
            sumL = _mm_add_ps(sumL, _mm_mul_ps(sampNeg, cNeg));
#endif
        } else if (CHANNELS == 2) {
            const __m128 posLow = _mm_loadu_ps(sP);
            const __m128 posHigh = _mm_loadu_ps(sP+4);
            const __m128 negLow = _mm_loadu_ps(sN);
            const __m128 negHigh = _mm_loadu_ps(sN+4);
            sP -= 8;
            sN += 8;

            // Split interleaved frames into L and R vectors. The sP side is also
            // reversed (even lanes -> L via 0x22, odd lanes -> R via 0x77); the sN
            // side keeps its order (0x88 for L, 0xDD for R).
            const __m128 posL = _mm_shuffle_ps(posHigh, posLow, 0x22);
            const __m128 posR = _mm_shuffle_ps(posHigh, posLow, 0x77);
            const __m128 negL = _mm_shuffle_ps(negLow, negHigh, 0x88);
            const __m128 negR = _mm_shuffle_ps(negLow, negHigh, 0xDD);
#if USE_AVX2
            sumL = _mm_fmadd_ps(posL, cPos, sumL);
            sumR = _mm_fmadd_ps(posR, cPos, sumR);
            sumL = _mm_fmadd_ps(negL, cNeg, sumL);
            sumR = _mm_fmadd_ps(negR, cNeg, sumR);
#else
            sumL = _mm_add_ps(sumL, _mm_mul_ps(posL, cPos));
            sumR = _mm_add_ps(sumR, _mm_mul_ps(posR, cPos));
            sumL = _mm_add_ps(sumL, _mm_mul_ps(negL, cNeg));
            sumR = _mm_add_ps(sumR, _mm_mul_ps(negR, cNeg));
#endif
        }

        count -= 4;
    } while (count != 0);

    // Low two lanes: volumeLR[0..1]; the upper two lanes stay zero.
    const __m128 gain = _mm_loadl_pi(_mm_setzero_ps(),
            reinterpret_cast<const __m64*>(volumeLR));
    // Low two lanes: current out[0..1]; the upper lanes are never stored.
    __m128 result = _mm_loadl_pi(gain, reinterpret_cast<__m64*>(out));

    // Reduce the partial sums so that lane 0 carries the value for out[0]
    // and lane 1 the value for out[1].
    __m128 folded = _mm_setzero_ps();
    if (CHANNELS == 1) {
        // Sum all four lanes of sumL; the total lands in both lane 0 and lane 1.
        folded = _mm_add_ps(sumL, _mm_movehl_ps(sumL, sumL));
        folded = _mm_add_ps(folded, _mm_shuffle_ps(folded, folded, 0x11));
    } else if (CHANNELS == 2) {
        // Two horizontal adds leave {L, R, L, R}.
        folded = _mm_hadd_ps(sumL, sumR);
        folded = _mm_hadd_ps(folded, folded);
    }

    // out[i] += sum[i] * volumeLR[i]
#if USE_AVX2
    result = _mm_fmadd_ps(folded, gain, result);
#else
    result = _mm_add_ps(result, _mm_mul_ps(folded, gain));
#endif
    _mm_storel_pi(reinterpret_cast<__m64*>(out), result);
}
// SSE specialization of ProcessL for the template arguments <1, 16>.
//
// Forwards to ProcessSSEIntrinsic<1, 16, true>. With FIXED = true that kernel never
// reads lerpP, coefsP1 or coefsN1, so neutral placeholders are passed for them.
//
//   out       two floats accumulated into by the kernel
//   count     number of coefficients per side
//   coefsP    coefficients applied to the samples around sP
//   coefsN    coefficients applied to the samples around sN
//   sP, sN    sample pointers for the two sides
//   volumeLR  two floats of gain applied before accumulating into out
template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    // nullptr / 0.0f instead of NULL / 0: no reliance on a NULL macro from some other
    // header, and the float argument is spelled as a float.
    ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0.0f /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
}
// SSE specialization of ProcessL for the template arguments <2, 16>.
//
// Forwards to ProcessSSEIntrinsic<2, 16, true>. With FIXED = true that kernel never
// reads lerpP, coefsP1 or coefsN1, so neutral placeholders are passed for them.
//
//   out       two floats accumulated into by the kernel
//   count     number of coefficients per side
//   coefsP    coefficients applied to the samples around sP
//   coefsN    coefficients applied to the samples around sN
//   sP, sN    sample pointers for the two sides (handled as interleaved pairs by the kernel)
//   volumeLR  two floats of gain applied before accumulating into out
template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    // nullptr / 0.0f instead of NULL / 0: no reliance on a NULL macro from some other
    // header, and the float argument is spelled as a float.
    ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0.0f /*lerpP*/, nullptr /*coefsP1*/, nullptr /*coefsN1*/);
}
// SSE specialization of Process for the template arguments <1, 16>.
//
// Forwards to ProcessSSEIntrinsic<1, 16, false>, i.e. the variant that blends
// coefsP/coefsN with coefsP1/coefsN1 using lerpP before applying them.
// Note that the kernel takes the same arguments in a different order.
template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<1, 16, false>(
            out,
            count,
            coefsP,
            coefsN,
            sP,
            sN,
            volumeLR,
            lerpP,
            coefsP1,
            coefsN1);
}
// SSE specialization of Process for the template arguments <2, 16>.
//
// Forwards to ProcessSSEIntrinsic<2, 16, false>, i.e. the variant that blends
// coefsP/coefsN with coefsP1/coefsN1 using lerpP before applying them.
// Note that the kernel takes the same arguments in a different order.
template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessSSEIntrinsic<2, 16, false>(
            out,
            count,
            coefsP,
            coefsN,
            sP,
            sN,
            volumeLR,
            lerpP,
            coefsP1,
            coefsN1);
}
#endif //USE_SSE
} // namespace android
#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/