You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
124 lines
3.0 KiB
124 lines
3.0 KiB
/*
|
|
* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
|
|
*/
|
|
|
|
//see https://github.com/kunpengcompute/AvxToNeon
|
|
|
|
#ifndef PF_NEON_DBL_FROM_AVX_H
|
|
#define PF_NEON_DBL_FROM_AVX_H
|
|
#include <arm_neon.h>
#include <assert.h>
|
|
|
|
|
|
/*
 * FORCE_INLINE is the declaration prefix used by every wrapper function in
 * this header.
 */
#if defined(__GNUC__) || defined(__clang__)

/*
 * Save whatever FORCE_INLINE the including code may already have defined,
 * then drop it so that the definition below is never diagnosed as a macro
 * redefinition (push_macro alone saves the old value but leaves it defined).
 * NOTE(review): no matching pop_macro is visible in this part of the file;
 * confirm whether the saved definition is meant to be restored.
 */
#pragma push_macro("FORCE_INLINE")
#undef FORCE_INLINE
#define FORCE_INLINE static inline __attribute__((always_inline))

#else

/*
 * Any other compiler is rejected. The lines after #error presumably only
 * keep FORCE_INLINE defined so the rest of the header produces fewer
 * follow-up diagnostics.
 */
#error "Macro name collisions may happens with unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif

#define FORCE_INLINE static inline

#endif
|
|
|
|
/* Stand-in for the AVX __m256 type: a pair of float32x4_t NEON vectors. */
typedef struct {
    float32x4_t vect_f32[2];
} __m256;
|
|
|
|
/*
 * Stand-in for the AVX __m256d type: a pair of float64x2_t NEON vectors.
 * Element 0 holds the first two doubles and element 1 the next two, as
 * loaded by _mm256_load_pd / _mm256_loadu_pd.
 */
typedef struct {
    float64x2_t vect_f64[2];
} __m256d;
|
|
|
|
/* Stand-in for the SSE __m128d type: a single float64x2_t NEON vector. */
typedef float64x2_t __m128d;
|
|
|
|
/* Returns a __m256d whose two halves are both filled with 0.0. */
FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    const float64x2_t zeros = vdupq_n_f64(0.0);
    __m256d out;

    out.vect_f64[0] = zeros;
    out.vect_f64[1] = zeros;
    return out;
}
|
|
|
|
/* Element-wise product a * b, computed independently on each 128-bit half. */
FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    const float64x2_t lo = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
    const float64x2_t hi = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
    __m256d product;

    product.vect_f64[0] = lo;
    product.vect_f64[1] = hi;
    return product;
}
|
|
|
|
/* Element-wise sum a + b, computed independently on each 128-bit half. */
FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    const float64x2_t lo = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
    const float64x2_t hi = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
    __m256d sum;

    sum.vect_f64[0] = lo;
    sum.vect_f64[1] = hi;
    return sum;
}
|
|
|
|
/* Element-wise difference a - b, computed independently on each 128-bit half. */
FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    const float64x2_t lo = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
    const float64x2_t hi = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
    __m256d diff;

    diff.vect_f64[0] = lo;
    diff.vect_f64[1] = hi;
    return diff;
}
|
|
|
|
/* Returns a __m256d whose two halves are both filled with the value a. */
FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    const float64x2_t splat = vdupq_n_f64(a);
    __m256d out;

    out.vect_f64[0] = splat;
    out.vect_f64[1] = splat;
    return out;
}
|
|
|
|
/*
 * Loads four consecutive doubles starting at mem_addr: the first two go
 * into half 0, the next two into half 1. Implemented with the same
 * vld1q_f64 loads as _mm256_loadu_pd.
 */
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    const float64x2_t first_pair = vld1q_f64(mem_addr);
    const float64x2_t second_pair = vld1q_f64(mem_addr + 2);
    __m256d loaded;

    loaded.vect_f64[0] = first_pair;
    loaded.vect_f64[1] = second_pair;
    return loaded;
}
|
|
/*
 * Loads four consecutive doubles starting at mem_addr: the first two go
 * into half 0, the next two into half 1.
 */
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    const float64x2_t first_pair = vld1q_f64(mem_addr);
    const float64x2_t second_pair = vld1q_f64(mem_addr + 2);
    __m256d loaded;

    loaded.vect_f64[0] = first_pair;
    loaded.vect_f64[1] = second_pair;
    return loaded;
}
|
|
|
|
/* Returns half 0 of a (the first two doubles); half 1 is discarded. */
FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    const __m128d first_half = a.vect_f64[0];

    return first_half;
}
|
|
|
|
/*
 * Returns one 128-bit half of a.
 *
 * imm8 selects the half: 0 for vect_f64[0], 1 for vect_f64[1]. Debug builds
 * assert that imm8 is 0 or 1. The index is additionally masked to its lowest
 * bit (the only bit the AVX instruction looks at), so a build with NDEBUG can
 * never read outside the two-element array.
 */
FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.vect_f64[imm8 & 1];
}
|
|
|
|
/*
 * Widens a 128-bit vector to 256 bits: a becomes half 0 of the result.
 *
 * AVX leaves the upper half undefined for this cast. It is zero-filled here
 * so the returned struct never carries an indeterminate value (which would
 * trigger uninitialized-variable warnings and is unsafe to read).
 */
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;

    res.vect_f64[0] = a;
    res.vect_f64[1] = vdupq_n_f64(0.0);
    return res;
}
|
|
|
|
#endif /* PF_NEON_DBL_FROM_AVX_H */
|
|
|