You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
8110 lines
242 KiB
8110 lines
242 KiB
// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifndef GEMMLOWP_META_TRANSFORM_KERNELS_ARM_32_H_
|
|
#define GEMMLOWP_META_TRANSFORM_KERNELS_ARM_32_H_
|
|
|
|
#ifdef GEMMLOWP_NEON_32
|
|
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
|
|
namespace gemmlowp {
|
|
namespace meta {
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 0>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 1>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.8 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 2>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #2\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.16 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 3>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #3\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.16 {d0[0]}, [%[output]]!\n"
|
|
"vst1.8 {d0[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 4>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #4\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 5>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #5\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d2[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.8 {d0[4]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 6>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #6\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.16 {d0[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 7>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #7\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2}, [%[input]]!\n"
|
|
"vld1.32 {d3[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.16 {d0[2]}, [%[output]]!\n"
|
|
"vst1.8 {d0[6]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 8>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #8\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 9>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #9\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.8 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 10>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #10\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.16 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 11>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #11\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4}, [%[input]]!\n"
|
|
"vld1.32 {d5[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.16 {d1[0]}, [%[output]]!\n"
|
|
"vst1.8 {d1[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 12>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #12\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 13>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #13\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5}, [%[input]]!\n"
|
|
"vld1.32 {d6[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.8 {d1[4]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 14>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #14\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.16 {d1[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<int32_t, uint8_t, Requantize, 16, 15>::Transform(
|
|
const int32_t* input, const Requantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Requantize<int32_t, uint8_t, Requantize, 16, 15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Requantize::Prepare
|
|
"vdup.32 q4, %[input_range_min]\n"
|
|
"vdup.32 q5, %[output_range_min]\n"
|
|
"vdup.32 q6, %[input_range_offset]\n"
|
|
"vdup.32 q7, %[input_range_scale]\n"
|
|
"vdup.32 q8, %[one_over_output_range_scale]\n"
|
|
"vsub.f32 q4, q4, q5\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Requantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6}, [%[input]]!\n"
|
|
"vld1.32 {d7[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q6\n"
|
|
"vsub.f32 q1, q1, q6\n"
|
|
"vsub.f32 q2, q2, q6\n"
|
|
"vsub.f32 q3, q3, q6\n"
|
|
"vmul.f32 q0, q0, q7\n"
|
|
"vmul.f32 q1, q1, q7\n"
|
|
"vmul.f32 q2, q2, q7\n"
|
|
"vmul.f32 q3, q3, q7\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q8\n"
|
|
"vmul.f32 q1, q1, q8\n"
|
|
"vmul.f32 q2, q2, q8\n"
|
|
"vmul.f32 q3, q3, q8\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.16 {d1[2]}, [%[output]]!\n"
|
|
"vst1.8 {d1[6]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [input_range_min] "r"(params.input_range_min),
|
|
[output_range_min] "r"(params.output_range_min),
|
|
[input_range_offset] "r"(params.input_range_offset),
|
|
[one_over_output_range_scale] "r"(params.one_over_output_range_scale),
|
|
[input_range_scale] "r"(params.input_range_scale)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "d14", "d15", "d16", "d17", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 0>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 1>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.8 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 2>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #2\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.16 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 3>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #3\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.16 {d0[0]}, [%[output]]!\n"
|
|
"vst1.8 {d0[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 4>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #4\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 5>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #5\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d2[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.8 {d0[4]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 6>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #6\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.16 {d0[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 7>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #7\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2}, [%[input]]!\n"
|
|
"vld1.32 {d3[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.16 {d0[2]}, [%[output]]!\n"
|
|
"vst1.8 {d0[6]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 8>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #8\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 9>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #9\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.8 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 10>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #10\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.16 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 11>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #11\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4}, [%[input]]!\n"
|
|
"vld1.32 {d5[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.16 {d1[0]}, [%[output]]!\n"
|
|
"vst1.8 {d1[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 12>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #12\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 13>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #13\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5}, [%[input]]!\n"
|
|
"vld1.32 {d6[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.8 {d1[4]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 14>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #14\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.16 {d1[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<float, uint8_t, Quantize, 16, 15>::Transform(
|
|
const float* input, const Quantize& params, uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Quantize<float, uint8_t, Quantize, 16, 15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Quantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6, d7}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Quantize::Transform
|
|
"vld1.32 {d0, d1, d2, d3}, [%[input]]!\n"
|
|
"vld1.32 {d4, d5, d6}, [%[input]]!\n"
|
|
"vld1.32 {d7[0]}, [%[input]]!\n"
|
|
"pld [%[input], #64]\n"
|
|
"vsub.f32 q0, q0, q4\n"
|
|
"vsub.f32 q1, q1, q4\n"
|
|
"vsub.f32 q2, q2, q4\n"
|
|
"vsub.f32 q3, q3, q4\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q5\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vadd.f32 q3, q3, q5\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
"vqmovn.s32 d0, q0\n"
|
|
"vqmovn.s32 d1, q1\n"
|
|
"vqmovn.s32 d4, q2\n"
|
|
"vqmovn.s32 d5, q3\n"
|
|
"vqmovun.s16 d0, q0\n"
|
|
"vqmovun.s16 d1, q2\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.16 {d1[2]}, [%[output]]!\n"
|
|
"vst1.8 {d1[6]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 0>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 1>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"vld1.8 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 2 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 2 and
// params.count >= 2 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 2>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #2\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 2 remaining bytes as one 16-bit lane of d0.
"vld1.16 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
|
|
// Only the first 2 converted lanes (d0) are stored.
"vst1.32 {d0}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 3 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 3 and
// params.count >= 3 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 3>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #3\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 3 remaining bytes: 2 via a 16-bit lane, 1 via a byte lane.
"vld1.16 {d0[0]}, [%[input]]!\n"
|
|
"vld1.8 {d0[2]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
|
|
// Store 3 floats: lanes 0-1 (d0) and lane 2 (d1[0]).
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 4 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 4 and
// params.count >= 4 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 4>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #4\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 4 remaining bytes as one 32-bit lane of d0.
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
|
|
// Store all 4 lanes of q0.
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 5 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 5 and
// params.count >= 5 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 5>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #5\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 5 remaining bytes: 4 via a 32-bit lane, 1 via a byte lane.
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.8 {d0[4]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
|
|
// Store 5 floats: all of q0 plus the lowest lane of q1 (d2[0]).
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"vst1.32 {d2[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 6 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 6 and
// params.count >= 6 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 6>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #6\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 6 remaining bytes: 4 via a 32-bit lane, 2 via a 16-bit lane.
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.16 {d0[2]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
|
|
// Store 6 floats: all of q0 plus the low half of q1 (d2).
"vst1.32 {d0, d1, d2}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 7 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 7 and
// params.count >= 7 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 7>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #7\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 7 remaining bytes: 4 via a 32-bit lane, 2 via a 16-bit lane and
// 1 via a byte lane.
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.16 {d0[2]}, [%[input]]!\n"
|
|
"vld1.8 {d0[6]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
|
|
// Store 7 floats: d0-d2 (6 lanes) plus one lane from d3.
"vst1.32 {d0, d1, d2}, [%[output]]!\n"
|
|
"vst1.32 {d3[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 8 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 8 and
// params.count >= 8 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 8>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #8\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 8 remaining bytes into d0.
"vld1.32 {d0}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
|
|
// Store 8 floats: all of q0 and q1.
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 9 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 9 and
// params.count >= 9 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 9>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #9\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 9 remaining bytes: 8 into d0, 1 via a byte lane of d1.
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.8 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
|
|
// Store 9 floats: q0 and q1 (8 lanes) plus one lane from d4.
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 10 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 10 and
// params.count >= 10 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 10>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #10\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 10 remaining bytes: 8 into d0, 2 via a 16-bit lane of d1.
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.16 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
|
|
// Store 10 floats: q0 and q1 (8 lanes) plus the low half of q2 (d4).
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 11 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 11 and
// params.count >= 11 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 11>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #11\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 11 remaining bytes: 8 into d0, then 2 via a 16-bit lane and 1 via
// a byte lane of d1.
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.16 {d1[0]}, [%[input]]!\n"
|
|
"vld1.8 {d1[2]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
|
|
// Store 11 floats: q0 and q1 (8 lanes), d4 (2 lanes) and one lane from d5.
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4}, [%[output]]!\n"
|
|
"vst1.32 {d5[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 12 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 12 and
// params.count >= 12 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 12>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #12\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 12 remaining bytes: 8 into d0, 4 via a 32-bit lane of d1.
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
|
|
// Store 12 floats: all of q0, q1 and q2.
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 13 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 13 and
// params.count >= 13 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 13>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #13\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 13 remaining bytes: 8 into d0, then 4 via a 32-bit lane and 1 via
// a byte lane of d1.
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.8 {d1[4]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
// Store 13 floats: q0-q2 (12 lanes) plus one lane from d6.
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5}, [%[output]]!\n"
|
|
"vst1.32 {d6[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
// Dequantize specialization for a 16-wide main loop with 14 leftover elements.
// For each uint8 in `input` it writes one float to `output`, computed as
// (value - range_offset) * range_scale + range_min.
// NOTE(review): the main loop only exits when the counter reaches exactly
// zero, so this presumably relies on params.count % 16 == 14 and
// params.count >= 14 -- confirm against the caller.
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 14>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
// Broadcast range_min / range_offset / range_scale into q4 / q5 / q6.
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #14\n"
|
|
// Counter is already zero: only the leftover elements remain.
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
// Widen u8 -> 16 bit -> 32 bit. The bytes are zero-extended first, so the
// signed second widening keeps them non-negative. Widening proceeds from the
// highest register down so that no result overwrites a d register that a
// later step still has to read.
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
// (x - range_offset) * range_scale + range_min, four lanes per q register.
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
// Loop until the "subs ... #16" above has brought the counter to zero.
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
// Load the 14 remaining bytes: 8 into d0, then 4 via a 32-bit lane and 2 via
// a 16-bit lane of d1.
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.16 {d1[2]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
// Store 14 floats: q0-q2 (12 lanes) plus the low half of q3 (d6).
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
// q4-q6 occupy d8-d13, hence the clobber list extends to d13.
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, float, Dequantize, 16, 15>::Transform(
|
|
const uint8_t* input, const Dequantize& params, float* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") Dequantize<uint8_t, float, Dequantize, 16, 15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// Dequantize::Prepare
|
|
"vdup.32 q4, %[range_min]\n"
|
|
"vdup.32 q5, %[range_offset]\n"
|
|
"vdup.32 q6, %[range_scale]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// Dequantize::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.16 {d1[2]}, [%[input]]!\n"
|
|
"vld1.8 {d1[6]}, [%[input]]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vsub.f32 q0, q0, q5\n"
|
|
"vsub.f32 q1, q1, q5\n"
|
|
"vsub.f32 q2, q2, q5\n"
|
|
"vsub.f32 q3, q3, q5\n"
|
|
"vmul.f32 q0, q0, q6\n"
|
|
"vmul.f32 q1, q1, q6\n"
|
|
"vmul.f32 q2, q2, q6\n"
|
|
"vmul.f32 q3, q3, q6\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q4\n"
|
|
"vadd.f32 q3, q3, q4\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6}, [%[output]]!\n"
|
|
"vst1.32 {d7[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [range_offset] "r"(params.range_offset),
|
|
[range_scale] "r"(params.range_scale), [range_min] "r"(params.range_min)
|
|
: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
|
|
"d11", "d12", "d13", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
0>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
1>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #1\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.8 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.8 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
2>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #2\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.16 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.16 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
3>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #3\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.16 {d0[0]}, [%[input]]!\n"
|
|
"vld1.8 {d0[2]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.16 {d0[0]}, [%[output]]!\n"
|
|
"vst1.8 {d0[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
4>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #4\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
5>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #5\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.8 {d0[4]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.8 {d0[4]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
6>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #6\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.16 {d0[2]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.16 {d0[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
7>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #7\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.16 {d0[2]}, [%[input]]!\n"
|
|
"vld1.8 {d0[6]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"vst1.16 {d0[2]}, [%[output]]!\n"
|
|
"vst1.8 {d0[6]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
8>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #8\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
9>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #9\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.8 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.8 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
10>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #10\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.16 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.16 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
11>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #11\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.16 {d1[0]}, [%[input]]!\n"
|
|
"vld1.8 {d1[2]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.16 {d1[0]}, [%[output]]!\n"
|
|
"vst1.8 {d1[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
12>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #12\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
13>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #13\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.8 {d1[4]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.8 {d1[4]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
14>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #14\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.16 {d1[2]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.16 {d1[2]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, uint8_t, MinMax<uint8_t>, 16,
|
|
15>::Transform(const uint8_t* input,
|
|
const MinMax<uint8_t>& params,
|
|
uint8_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") MinMax<uint8_t><uint8_t, uint8_t, MinMax<uint8_t>, 16, "
|
|
"15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_count_copy = params.count;
|
|
asm volatile(
|
|
|
|
// MinMax::Prepare
|
|
"vdup.8 q4, %[min]\n"
|
|
"vdup.8 q5, %[max]\n"
|
|
|
|
// Reduce count by leftovers.
|
|
"subs %[count], %[count], #15\n"
|
|
"beq 2f\n"
|
|
|
|
"1:"
|
|
"subs %[count], %[count], #16\n"
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
|
|
"bne 1b\n"
|
|
"2:"
|
|
|
|
// Handle leftovers.
|
|
|
|
// MinMax::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.16 {d1[2]}, [%[input]]!\n"
|
|
"vld1.8 {d1[6]}, [%[input]]!\n"
|
|
"pld [%[input], #16]\n"
|
|
"vmax.u8 q0, q0, q4\n"
|
|
"vmin.u8 q0, q0, q5\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"vst1.16 {d1[2]}, [%[output]]!\n"
|
|
"vst1.8 {d1[6]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
: [count] "+r"(params_count_copy), [input] "+r"(input),
|
|
[output] "+r"(output)
|
|
: [max] "r"(params.max), [min] "r"(params.min)
|
|
: "d0", "d1", "d8", "d9", "d10", "d11", "cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
0>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"0>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
1>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"1>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #1\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.8 {d0[0]}, [%[input]]!\n"
|
|
"vld1.8 {d2[0]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q1, d2\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q1, d2\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q10\n"
|
|
"vadd.f32 q0, q0, q1\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
|
|
"vst1.32 {d0[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
2>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"2>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #2\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.16 {d0[0]}, [%[input]]!\n"
|
|
"vld1.16 {d2[0]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q1, d2\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q1, d2\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q10\n"
|
|
"vadd.f32 q0, q0, q1\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
3>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"3>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #3\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.16 {d0[0]}, [%[input]]!\n"
|
|
"vld1.8 {d0[2]}, [%[input]]!\n"
|
|
"vld1.16 {d2[0]}, [r1]!\n"
|
|
"vld1.8 {d2[2]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q1, d2\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q1, d2\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q10\n"
|
|
"vadd.f32 q0, q0, q1\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
|
|
"vst1.32 {d0}, [%[output]]!\n"
|
|
"vst1.32 {d1[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
4>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"4>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #4\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.32 {d2[0]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q1, d2\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q1, d2\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q10\n"
|
|
"vadd.f32 q0, q0, q1\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
5>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"5>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #5\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.8 {d0[4]}, [%[input]]!\n"
|
|
"vld1.32 {d4[0]}, [r1]!\n"
|
|
"vld1.8 {d4[4]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q2, d4\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q3, d5\n"
|
|
"vmovl.s16 q2, d4\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q11\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q10\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q0, q0, q2\n"
|
|
"vadd.f32 q1, q1, q3\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
|
|
"vst1.32 {d0, d1}, [%[output]]!\n"
|
|
"vst1.32 {d2[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
6>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"6>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #6\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.16 {d0[2]}, [%[input]]!\n"
|
|
"vld1.32 {d4[0]}, [r1]!\n"
|
|
"vld1.16 {d4[2]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q2, d4\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q3, d5\n"
|
|
"vmovl.s16 q2, d4\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q11\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q10\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q0, q0, q2\n"
|
|
"vadd.f32 q1, q1, q3\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
|
|
"vst1.32 {d0, d1, d2}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
7>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"7>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #7\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0[0]}, [%[input]]!\n"
|
|
"vld1.16 {d0[2]}, [%[input]]!\n"
|
|
"vld1.8 {d0[6]}, [%[input]]!\n"
|
|
"vld1.32 {d4[0]}, [r1]!\n"
|
|
"vld1.16 {d4[2]}, [r1]!\n"
|
|
"vld1.8 {d4[6]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q2, d4\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q3, d5\n"
|
|
"vmovl.s16 q2, d4\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q11\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q10\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q0, q0, q2\n"
|
|
"vadd.f32 q1, q1, q3\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
|
|
"vst1.32 {d0, d1, d2}, [%[output]]!\n"
|
|
"vst1.32 {d3[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
8>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"8>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #8\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d4}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q2, d4\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q3, d5\n"
|
|
"vmovl.s16 q2, d4\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q11\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q10\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q0, q0, q2\n"
|
|
"vadd.f32 q1, q1, q3\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
9>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"9>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #9\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.8 {d1[0]}, [%[input]]!\n"
|
|
"vld1.32 {d6}, [r1]!\n"
|
|
"vld1.8 {d7[0]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q4, d7\n"
|
|
"vmovl.u8 q3, d6\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q5, d8\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q4, d7\n"
|
|
"vmovl.s16 q3, d6\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q0, q0, q3\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
10>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"10>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #10\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.16 {d1[0]}, [%[input]]!\n"
|
|
"vld1.32 {d6}, [r1]!\n"
|
|
"vld1.16 {d7[0]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q4, d7\n"
|
|
"vmovl.u8 q3, d6\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q5, d8\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q4, d7\n"
|
|
"vmovl.s16 q3, d6\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q0, q0, q3\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
11>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"11>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #11\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.16 {d1[0]}, [%[input]]!\n"
|
|
"vld1.8 {d1[2]}, [%[input]]!\n"
|
|
"vld1.32 {d6}, [r1]!\n"
|
|
"vld1.16 {d7[0]}, [r1]!\n"
|
|
"vld1.8 {d7[2]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q4, d7\n"
|
|
"vmovl.u8 q3, d6\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q5, d8\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q4, d7\n"
|
|
"vmovl.s16 q3, d6\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q0, q0, q3\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4}, [%[output]]!\n"
|
|
"vst1.32 {d5[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
12>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"12>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #12\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.32 {d6}, [r1]!\n"
|
|
"vld1.32 {d7[0]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q4, d7\n"
|
|
"vmovl.u8 q3, d6\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q5, d8\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q4, d7\n"
|
|
"vmovl.s16 q3, d6\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q11\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q10\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q0, q0, q3\n"
|
|
"vadd.f32 q1, q1, q4\n"
|
|
"vadd.f32 q2, q2, q5\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
13>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"13>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #13\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.8 {d1[4]}, [%[input]]!\n"
|
|
"vld1.32 {d8}, [r1]!\n"
|
|
"vld1.32 {d9[0]}, [r1]!\n"
|
|
"vld1.8 {d9[4]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5}, [%[output]]!\n"
|
|
"vst1.32 {d6[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
14>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"14>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #14\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.16 {d1[2]}, [%[input]]!\n"
|
|
"vld1.32 {d8}, [r1]!\n"
|
|
"vld1.32 {d9[0]}, [r1]!\n"
|
|
"vld1.16 {d9[2]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
template <>
|
|
inline void Transform1DKernel<uint8_t, int32_t, BiasAdd<uint8_t>, 16,
|
|
15>::Transform(const uint8_t* input,
|
|
const BiasAdd<uint8_t>& params,
|
|
int32_t* output) {
|
|
#ifdef DEBUG
|
|
#ifdef DEBUG_METAGEMM_VERBOSE
|
|
std::cout << __FILE__ << "(" << __LINE__
|
|
<< ") BiasAdd<uint8_t><uint8_t, int32_t, BiasAdd<uint8_t>, 16, "
|
|
"15>::Transform()"
|
|
<< std::endl
|
|
<< std::flush;
|
|
#endif
|
|
#endif
|
|
int params_rows_copy = params.rows;
|
|
asm volatile(
|
|
"ldr r0, %[input_range_min]\n"
|
|
"vdup.32 q8, r0\n"
|
|
"ldr r0, %[input_range_scale]\n"
|
|
"vdup.32 q9, r0\n"
|
|
"ldr r0, %[bias_range_min]\n"
|
|
"vdup.32 q10, r0\n"
|
|
"ldr r0, %[bias_range_scale]\n"
|
|
"vdup.32 q11, r0\n"
|
|
"ldr r0, %[output_range_min]\n"
|
|
"vdup.32 q12, r0\n"
|
|
"ldr r0, %[one_over_output_range_scale]\n"
|
|
"vdup.32 q13, r0\n"
|
|
"ldr r0, %[output_range_offset]\n"
|
|
"vdup.32 q14, r0\n"
|
|
"1:"
|
|
"mov r0, %[count]\n"
|
|
"mov r1, %[bias]\n"
|
|
"subs r0, r0, #15\n"
|
|
"beq 3f\n"
|
|
"2:"
|
|
"subs r0, r0, #16\n"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0, d1}, [%[input]]!\n"
|
|
"vld1.32 {d8, d9}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6, d7}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"bne 2b\n"
|
|
"3:"
|
|
|
|
// BiasAdd::Transform
|
|
"vld1.32 {d0}, [%[input]]!\n"
|
|
"vld1.32 {d1[0]}, [%[input]]!\n"
|
|
"vld1.16 {d1[2]}, [%[input]]!\n"
|
|
"vld1.8 {d1[6]}, [%[input]]!\n"
|
|
"vld1.32 {d8}, [r1]!\n"
|
|
"vld1.32 {d9[0]}, [r1]!\n"
|
|
"vld1.16 {d9[2]}, [r1]!\n"
|
|
"vld1.8 {d9[6]}, [r1]!\n"
|
|
"pld [%[input], #32]\n"
|
|
"vmovl.u8 q1, d1\n"
|
|
"vmovl.u8 q0, d0\n"
|
|
"vmovl.u8 q5, d9\n"
|
|
"vmovl.u8 q4, d8\n"
|
|
"vmovl.s16 q3, d3\n"
|
|
"vmovl.s16 q2, d2\n"
|
|
"vmovl.s16 q7, d11\n"
|
|
"vmovl.s16 q6, d10\n"
|
|
"vmovl.s16 q1, d1\n"
|
|
"vmovl.s16 q0, d0\n"
|
|
"vmovl.s16 q5, d9\n"
|
|
"vmovl.s16 q4, d8\n"
|
|
"vcvt.f32.s32 q0, q0\n"
|
|
"vcvt.f32.s32 q1, q1\n"
|
|
"vcvt.f32.s32 q2, q2\n"
|
|
"vcvt.f32.s32 q3, q3\n"
|
|
"vcvt.f32.s32 q4, q4\n"
|
|
"vcvt.f32.s32 q5, q5\n"
|
|
"vcvt.f32.s32 q6, q6\n"
|
|
"vcvt.f32.s32 q7, q7\n"
|
|
"vmul.f32 q0, q0, q9\n"
|
|
"vmul.f32 q1, q1, q9\n"
|
|
"vmul.f32 q2, q2, q9\n"
|
|
"vmul.f32 q3, q3, q9\n"
|
|
"vmul.f32 q4, q4, q11\n"
|
|
"vmul.f32 q5, q5, q11\n"
|
|
"vmul.f32 q6, q6, q11\n"
|
|
"vmul.f32 q7, q7, q11\n"
|
|
"vadd.f32 q0, q0, q8\n"
|
|
"vadd.f32 q1, q1, q8\n"
|
|
"vadd.f32 q2, q2, q8\n"
|
|
"vadd.f32 q3, q3, q8\n"
|
|
"vadd.f32 q4, q4, q10\n"
|
|
"vadd.f32 q5, q5, q10\n"
|
|
"vadd.f32 q6, q6, q10\n"
|
|
"vadd.f32 q7, q7, q10\n"
|
|
"vadd.f32 q0, q0, q4\n"
|
|
"vadd.f32 q1, q1, q5\n"
|
|
"vadd.f32 q2, q2, q6\n"
|
|
"vadd.f32 q3, q3, q7\n"
|
|
"vsub.f32 q0, q0, q12\n"
|
|
"vsub.f32 q1, q1, q12\n"
|
|
"vsub.f32 q2, q2, q12\n"
|
|
"vsub.f32 q3, q3, q12\n"
|
|
"vmul.f32 q0, q0, q13\n"
|
|
"vmul.f32 q1, q1, q13\n"
|
|
"vmul.f32 q2, q2, q13\n"
|
|
"vmul.f32 q3, q3, q13\n"
|
|
"vadd.f32 q0, q0, q14\n"
|
|
"vadd.f32 q1, q1, q14\n"
|
|
"vadd.f32 q2, q2, q14\n"
|
|
"vadd.f32 q3, q3, q14\n"
|
|
"vcvt.s32.f32 q0, q0\n"
|
|
"vcvt.s32.f32 q1, q1\n"
|
|
"vcvt.s32.f32 q2, q2\n"
|
|
"vcvt.s32.f32 q3, q3\n"
|
|
|
|
"vst1.32 {d0, d1, d2, d3}, [%[output]]!\n"
|
|
"vst1.32 {d4, d5, d6}, [%[output]]!\n"
|
|
"vst1.32 {d7[0]}, [%[output]]!\n"
|
|
"pld [%[output]]\n"
|
|
"subs %[rows], %[rows], #1\n"
|
|
"bne 1b\n"
|
|
: [input] "+r"(input), [output] "+r"(output)
|
|
: [count] "r"(params.count), [rows] "r"(params_rows_copy),
|
|
[output_range_offset] "m"(params.output_range_offset),
|
|
[input_range_scale] "m"(params.input_range_scale),
|
|
[one_over_output_range_scale] "m"(params.one_over_output_range_scale),
|
|
[bias_range_min] "m"(params.bias_range_min),
|
|
[output_range_min] "m"(params.output_range_min),
|
|
[bias_range_scale] "m"(params.bias_range_scale),
|
|
[bias] "r"(params.bias), [input_range_min] "m"(params.input_range_min)
|
|
: "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
|
|
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
|
|
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
|
|
"cc", "memory");
|
|
}
|
|
|
|
} // namespace meta
|
|
} // namespace gemmlowp
|
|
|
|
#else
|
|
#warning "Meta gemm for arm32 requires: GEMMLOWP_NEON_32!"
|
|
#endif
|
|
|
|
#endif // GEMMLOWP_META_TRANSFORM_KERNELS_ARM_32_H_
|