7966 lines
250 KiB
7966 lines
250 KiB
// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
|
|
#define GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
|
|
|
|
#ifdef GEMMLOWP_NEON_64
|
|
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
|
|
namespace gemmlowp {
|
|
namespace meta {
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 0>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 1>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.b}[0], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 2>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #2\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.h}[0], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 3>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #3\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.h}[0], [%x[output]], #2\n"
|
|
"st1 {v0.b}[2], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 4>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #4\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 5>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #5\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v1.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.b}[4], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 6>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #6\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v1.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.h}[2], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 7>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #7\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v1.2s}, [%x[input]], #8\n"
|
|
"ld1 {v1.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.h}[2], [%x[output]], #2\n"
|
|
"st1 {v0.b}[6], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 8>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #8\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 9>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #9\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"ld1 {v2.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.b}[8], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 10>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #10\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"ld1 {v2.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.h}[4], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 11>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #11\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"ld1 {v2.2s}, [%x[input]], #8\n"
|
|
"ld1 {v2.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.h}[4], [%x[output]], #2\n"
|
|
"st1 {v0.b}[10], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 12>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #12\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 13>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #13\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"ld1 {v3.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.b}[12], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 14>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #14\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"ld1 {v3.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.h}[6], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 15>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"dup v4.4s, %w[input_range_min]\n"
|
|
"dup v5.4s, %w[output_range_min]\n"
|
|
"dup v6.4s, %w[input_range_offset]\n"
|
|
"dup v7.4s, %w[input_range_scale]\n"
|
|
"dup v8.4s, %w[one_over_output_range_scale]\n"
|
|
"fsub v4.4s, v4.4s, v5.4s\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"ld1 {v3.2s}, [%x[input]], #8\n"
|
|
"ld1 {v3.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v6.4s\n"
|
|
"fsub v1.4s, v1.4s, v6.4s\n"
|
|
"fsub v2.4s, v2.4s, v6.4s\n"
|
|
"fsub v3.4s, v3.4s, v6.4s\n"
|
|
"fmul v0.4s, v0.4s, v7.4s\n"
|
|
"fmul v1.4s, v1.4s, v7.4s\n"
|
|
"fmul v2.4s, v2.4s, v7.4s\n"
|
|
"fmul v3.4s, v3.4s, v7.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v8.4s\n"
|
|
"fmul v1.4s, v1.4s, v8.4s\n"
|
|
"fmul v2.4s, v2.4s, v8.4s\n"
|
|
"fmul v3.4s, v3.4s, v8.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.h}[6], [%x[output]], #2\n"
|
|
"st1 {v0.b}[14], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 0>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 1>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.b}[0], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 2>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #2\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.h}[0], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 3>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #3\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.h}[0], [%x[output]], #2\n"
|
|
"st1 {v0.b}[2], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 4>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #4\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 5>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #5\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v1.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.b}[4], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 6>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #6\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v1.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.h}[2], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 7>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #7\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v1.2s}, [%x[input]], #8\n"
|
|
"ld1 {v1.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.h}[2], [%x[output]], #2\n"
|
|
"st1 {v0.b}[6], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 8>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #8\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 9>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #9\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"ld1 {v2.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.b}[8], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 10>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #10\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"ld1 {v2.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.h}[4], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 11>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #11\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s}, [%x[input]], #32\n"
|
|
"ld1 {v2.2s}, [%x[input]], #8\n"
|
|
"ld1 {v2.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.h}[4], [%x[output]], #2\n"
|
|
"st1 {v0.b}[10], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 12>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #12\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 13>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #13\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"ld1 {v3.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.b}[12], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 14>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #14\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"ld1 {v3.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.h}[6], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 15>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[input]], #64\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"ld1 {v0.4s, v1.4s, v2.4s}, [%x[input]], #48\n"
|
|
"ld1 {v3.2s}, [%x[input]], #8\n"
|
|
"ld1 {v3.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #64]\n"
|
|
"fsub v0.4s, v0.4s, v4.4s\n"
|
|
"fsub v1.4s, v1.4s, v4.4s\n"
|
|
"fsub v2.4s, v2.4s, v4.4s\n"
|
|
"fsub v3.4s, v3.4s, v4.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v5.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fadd v3.4s, v3.4s, v5.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
"sqxtn v0.4h, v0.4s\n"
|
|
"sqxtn2 v0.8h, v1.4s\n"
|
|
"sqxtn v2.4h, v2.4s\n"
|
|
"sqxtn2 v2.8h, v3.4s\n"
|
|
"sqxtun v0.8b, v0.8h\n"
|
|
"sqxtun2 v0.16b, v2.8h\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.h}[6], [%x[output]], #2\n"
|
|
"st1 {v0.b}[14], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 0>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 1>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.b}[0], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 2>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #2\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.h}[0], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 3>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #3\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.h}[0], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[2], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 4>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #4\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 5>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #5\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.b}[4], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"st1 {v1.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 6>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #6\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[2], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"st1 {v1.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 7>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #7\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[2], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[6], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"st1 {v1.2s}, [%x[output]], #8\n"
|
|
"st1 {v1.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 8>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #8\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 9>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #9\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.b}[8], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"st1 {v2.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 10>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #10\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.h}[4], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"st1 {v2.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 11>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #11\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.h}[4], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[10], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"st1 {v2.2s}, [%x[output]], #8\n"
|
|
"st1 {v2.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 12>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #12\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 13>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #13\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.b}[12], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"st1 {v3.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 14>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #14\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[6], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"st1 {v3.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 15>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"dup v4.4s, %w[range_min]\n"
|
|
"dup v5.4s, %w[range_offset]\n"
|
|
"dup v6.4s, %w[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[6], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[14], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v5.4s\n"
|
|
"fsub v1.4s, v1.4s, v5.4s\n"
|
|
"fsub v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v3.4s, v3.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v6.4s\n"
|
|
"fmul v1.4s, v1.4s, v6.4s\n"
|
|
"fmul v2.4s, v2.4s, v6.4s\n"
|
|
"fmul v3.4s, v3.4s, v6.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v4.4s\n"
|
|
"fadd v3.4s, v3.4s, v4.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"st1 {v3.2s}, [%x[output]], #8\n"
|
|
"st1 {v3.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
0>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
1>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.b}[0], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.b}[0], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
2>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #2\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.h}[0], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.h}[0], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
3>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #3\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.h}[0], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[2], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.h}[0], [%x[output]], #2\n"
|
|
"st1 {v0.b}[2], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
4>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #4\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
5>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #5\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.b}[4], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.b}[4], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
6>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #6\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[2], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.h}[2], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
7>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #7\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[2], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[6], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"st1 {v0.h}[2], [%x[output]], #2\n"
|
|
"st1 {v0.b}[6], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
8>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #8\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
9>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #9\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.b}[8], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.b}[8], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
10>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #10\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.h}[4], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.h}[4], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
11>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #11\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.h}[4], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[10], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.h}[4], [%x[output]], #2\n"
|
|
"st1 {v0.b}[10], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
12>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #12\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
13>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #13\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.b}[12], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.b}[12], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
14>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #14\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[6], [%x[input]], #2\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.h}[6], [%x[output]], #2\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
15>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"dup v4.16b, %w[min]\n"
|
|
"dup v5.16b, %w[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %x[count], %x[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %x[count], %x[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[6], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[14], [%x[input]], #1\n"
|
|
"prfm pldl1keep, [%x[input], #16]\n"
|
|
"umax v0.16b, v0.16b, v4.16b\n"
|
|
"umin v0.16b, v0.16b, v5.16b\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"st1 {v0.h}[6], [%x[output]], #2\n"
|
|
"st1 {v0.b}[14], [%x[output]], #1\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "v0", "v4", "v5", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
0>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
1>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #1\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.b}[0], [%x[input]], #1\n"
|
|
"ld1 {v1.b}[0], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v1.8h, v1.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl v1.4s, v1.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
|
|
"st1 {v0.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
2>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #2\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.h}[0], [%x[input]], #2\n"
|
|
"ld1 {v1.h}[0], [x1], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v1.8h, v1.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl v1.4s, v1.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
3>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #3\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.h}[0], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[2], [%x[input]], #1\n"
|
|
"ld1 {v1.h}[0], [x1], #2\n"
|
|
"ld1 {v1.b}[2], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v1.8h, v1.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl v1.4s, v1.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
|
|
"st1 {v0.2s}, [%x[output]], #8\n"
|
|
"st1 {v0.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
4>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #4\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v1.s}[0], [x1], #4\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v1.8h, v1.8b\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl v1.4s, v1.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v1.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
5>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #5\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.b}[4], [%x[input]], #1\n"
|
|
"ld1 {v2.s}[0], [x1], #4\n"
|
|
"ld1 {v2.b}[4], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v2.8h, v2.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v3.4s, v2.8h\n"
|
|
"sxtl v2.4s, v2.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v11.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v10.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v2.4s\n"
|
|
"fadd v1.4s, v1.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"st1 {v1.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
6>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #6\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[2], [%x[input]], #2\n"
|
|
"ld1 {v2.s}[0], [x1], #4\n"
|
|
"ld1 {v2.h}[2], [x1], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v2.8h, v2.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v3.4s, v2.8h\n"
|
|
"sxtl v2.4s, v2.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v11.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v10.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v2.4s\n"
|
|
"fadd v1.4s, v1.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"st1 {v1.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
7>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #7\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.s}[0], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[2], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[6], [%x[input]], #1\n"
|
|
"ld1 {v2.s}[0], [x1], #4\n"
|
|
"ld1 {v2.h}[2], [x1], #2\n"
|
|
"ld1 {v2.b}[6], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v2.8h, v2.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v3.4s, v2.8h\n"
|
|
"sxtl v2.4s, v2.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v11.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v10.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v2.4s\n"
|
|
"fadd v1.4s, v1.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
|
|
"st1 {v0.4s}, [%x[output]], #16\n"
|
|
"st1 {v1.2s}, [%x[output]], #8\n"
|
|
"st1 {v1.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
8>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #8\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v2.2s}, [x1], #8\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl v2.8h, v2.8b\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v3.4s, v2.8h\n"
|
|
"sxtl v2.4s, v2.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v11.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v10.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v2.4s\n"
|
|
"fadd v1.4s, v1.4s, v3.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
9>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #9\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.b}[8], [%x[input]], #1\n"
|
|
"ld1 {v3.2s}, [x1], #8\n"
|
|
"ld1 {v3.b}[8], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v4.8h, v3.16b\n"
|
|
"uxtl v3.8h, v3.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl v5.4s, v4.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v4.4s, v3.8h\n"
|
|
"sxtl v3.4s, v3.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v3.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"st1 {v2.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
10>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #10\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.h}[4], [%x[input]], #2\n"
|
|
"ld1 {v3.2s}, [x1], #8\n"
|
|
"ld1 {v3.h}[4], [x1], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v4.8h, v3.16b\n"
|
|
"uxtl v3.8h, v3.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl v5.4s, v4.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v4.4s, v3.8h\n"
|
|
"sxtl v3.4s, v3.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v3.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"st1 {v2.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
11>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #11\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.h}[4], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[10], [%x[input]], #1\n"
|
|
"ld1 {v3.2s}, [x1], #8\n"
|
|
"ld1 {v3.h}[4], [x1], #2\n"
|
|
"ld1 {v3.b}[10], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v4.8h, v3.16b\n"
|
|
"uxtl v3.8h, v3.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl v5.4s, v4.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v4.4s, v3.8h\n"
|
|
"sxtl v3.4s, v3.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v3.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s}, [%x[output]], #32\n"
|
|
"st1 {v2.2s}, [%x[output]], #8\n"
|
|
"st1 {v2.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
12>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #12\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v3.2s}, [x1], #8\n"
|
|
"ld1 {v3.s}[2], [x1], #4\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v4.8h, v3.16b\n"
|
|
"uxtl v3.8h, v3.8b\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl v5.4s, v4.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v4.4s, v3.8h\n"
|
|
"sxtl v3.4s, v3.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v11.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v10.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v3.4s\n"
|
|
"fadd v1.4s, v1.4s, v4.4s\n"
|
|
"fadd v2.4s, v2.4s, v5.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
13>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #13\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.b}[12], [%x[input]], #1\n"
|
|
"ld1 {v4.2s}, [x1], #8\n"
|
|
"ld1 {v4.s}[2], [x1], #4\n"
|
|
"ld1 {v4.b}[12], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"st1 {v3.s}[0], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
14>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #14\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[6], [%x[input]], #2\n"
|
|
"ld1 {v4.2s}, [x1], #8\n"
|
|
"ld1 {v4.s}[2], [x1], #4\n"
|
|
"ld1 {v4.h}[6], [x1], #2\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"st1 {v3.2s}, [%x[output]], #8\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
15>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr w0, %[input_range_min]\n"
|
|
"dup v8.4s, w0\n"
|
|
"ldr w0, %[input_range_scale]\n"
|
|
"dup v9.4s, w0\n"
|
|
"ldr w0, %[bias_range_min]\n"
|
|
"dup v10.4s, w0\n"
|
|
"ldr w0, %[bias_range_scale]\n"
|
|
"dup v11.4s, w0\n"
|
|
"ldr w0, %[output_range_min]\n"
|
|
"dup v12.4s, w0\n"
|
|
"ldr w0, %[one_over_output_range_scale]\n"
|
|
"dup v13.4s, w0\n"
|
|
"ldr w0, %[output_range_offset]\n"
|
|
"dup v14.4s, w0\n"
|
|
"1:"
|
|
"mov x0, %x[count]\n"
|
|
"mov x1, %x[bias]\n"
|
|
"subs x0, x0, #15\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs x0, x0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.4s}, [%x[input]], #16\n"
|
|
"ld1 {v4.4s}, [x1], #16\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%x[output]], #64\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"ld1 {v0.2s}, [%x[input]], #8\n"
|
|
"ld1 {v0.s}[2], [%x[input]], #4\n"
|
|
"ld1 {v0.h}[6], [%x[input]], #2\n"
|
|
"ld1 {v0.b}[14], [%x[input]], #1\n"
|
|
"ld1 {v4.2s}, [x1], #8\n"
|
|
"ld1 {v4.s}[2], [x1], #4\n"
|
|
"ld1 {v4.h}[6], [x1], #2\n"
|
|
"ld1 {v4.b}[14], [x1], #1\n"
|
|
"prfm pldl1keep, [%x[input], #32]\n"
|
|
"uxtl2 v1.8h, v0.16b\n"
|
|
"uxtl v0.8h, v0.8b\n"
|
|
"uxtl2 v5.8h, v4.16b\n"
|
|
"uxtl v4.8h, v4.8b\n"
|
|
"sxtl2 v3.4s, v1.8h\n"
|
|
"sxtl v2.4s, v1.4h\n"
|
|
"sxtl2 v7.4s, v5.8h\n"
|
|
"sxtl v6.4s, v5.4h\n"
|
|
"sxtl2 v1.4s, v0.8h\n"
|
|
"sxtl v0.4s, v0.4h\n"
|
|
"sxtl2 v5.4s, v4.8h\n"
|
|
"sxtl v4.4s, v4.4h\n"
|
|
"scvtf v0.4s, v0.4s\n"
|
|
"scvtf v1.4s, v1.4s\n"
|
|
"scvtf v2.4s, v2.4s\n"
|
|
"scvtf v3.4s, v3.4s\n"
|
|
"scvtf v4.4s, v4.4s\n"
|
|
"scvtf v5.4s, v5.4s\n"
|
|
"scvtf v6.4s, v6.4s\n"
|
|
"scvtf v7.4s, v7.4s\n"
|
|
"fmul v0.4s, v0.4s, v9.4s\n"
|
|
"fmul v1.4s, v1.4s, v9.4s\n"
|
|
"fmul v2.4s, v2.4s, v9.4s\n"
|
|
"fmul v3.4s, v3.4s, v9.4s\n"
|
|
"fmul v4.4s, v4.4s, v11.4s\n"
|
|
"fmul v5.4s, v5.4s, v11.4s\n"
|
|
"fmul v6.4s, v6.4s, v11.4s\n"
|
|
"fmul v7.4s, v7.4s, v11.4s\n"
|
|
"fadd v0.4s, v0.4s, v8.4s\n"
|
|
"fadd v1.4s, v1.4s, v8.4s\n"
|
|
"fadd v2.4s, v2.4s, v8.4s\n"
|
|
"fadd v3.4s, v3.4s, v8.4s\n"
|
|
"fadd v4.4s, v4.4s, v10.4s\n"
|
|
"fadd v5.4s, v5.4s, v10.4s\n"
|
|
"fadd v6.4s, v6.4s, v10.4s\n"
|
|
"fadd v7.4s, v7.4s, v10.4s\n"
|
|
"fadd v0.4s, v0.4s, v4.4s\n"
|
|
"fadd v1.4s, v1.4s, v5.4s\n"
|
|
"fadd v2.4s, v2.4s, v6.4s\n"
|
|
"fadd v3.4s, v3.4s, v7.4s\n"
|
|
"fsub v0.4s, v0.4s, v12.4s\n"
|
|
"fsub v1.4s, v1.4s, v12.4s\n"
|
|
"fsub v2.4s, v2.4s, v12.4s\n"
|
|
"fsub v3.4s, v3.4s, v12.4s\n"
|
|
"fmul v0.4s, v0.4s, v13.4s\n"
|
|
"fmul v1.4s, v1.4s, v13.4s\n"
|
|
"fmul v2.4s, v2.4s, v13.4s\n"
|
|
"fmul v3.4s, v3.4s, v13.4s\n"
|
|
"fadd v0.4s, v0.4s, v14.4s\n"
|
|
"fadd v1.4s, v1.4s, v14.4s\n"
|
|
"fadd v2.4s, v2.4s, v14.4s\n"
|
|
"fadd v3.4s, v3.4s, v14.4s\n"
|
|
"fcvtzs v0.4s, v0.4s\n"
|
|
"fcvtzs v1.4s, v1.4s\n"
|
|
"fcvtzs v2.4s, v2.4s\n"
|
|
"fcvtzs v3.4s, v3.4s\n"
|
|
|
|
"st1 {v0.4s, v1.4s, v2.4s}, [%x[output]], #48\n"
|
|
"st1 {v3.2s}, [%x[output]], #8\n"
|
|
"st1 {v3.s}[2], [%x[output]], #4\n"
|
|
"prfm pldl1keep, [%x[output]]\n"
|
|
"subs %x[rows], %x[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
|
|
"v10", "v11", "v12", "v13", "v14", "cc", "memory");
|
|
}
|
|
|
|
} // namespace meta
|
|
} // namespace gemmlowp
|
|
|
|
#else
|
|
#warning "Meta gemm for arm64 requires: GEMMLOWP_NEON_64!"
|
|
#endif
|
|
|
|
#endif // GEMMLOWP_META_TRANSFORM_KERNELS_ARM_64_H_
|