You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
152 lines
5.0 KiB
152 lines
5.0 KiB
4 months ago
|
// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
#include <unistd.h>
|
||
|
#ifdef __APPLE__
|
||
|
#include <sys/time.h>
|
||
|
#endif
|
||
|
|
||
|
#include <cstdint>
|
||
|
#include <cstdlib>
|
||
|
#include <ctime>
|
||
|
#include <iomanip>
|
||
|
#include <iostream>
|
||
|
#include <map>
|
||
|
#include <memory>
|
||
|
#include <vector>
|
||
|
|
||
|
#include "multi_thread_transform.h"
|
||
|
#include "transform_kernels.h"
|
||
|
|
||
|
using namespace gemmlowp::meta;
|
||
|
|
||
|
// Returns a timestamp in seconds, as a double, intended for measuring elapsed
// intervals by subtracting two readings.
//
// On Apple platforms the reading comes from gettimeofday() (microsecond
// fields). Everywhere else it comes from clock_gettime() with CLOCK_MONOTONIC
// (nanosecond fields), so a wall-clock adjustment that happens while a
// benchmark is running cannot stretch, shrink or negate the measured
// interval. Because the monotonic clock has an unspecified starting point,
// only the difference between two return values is meaningful there.
double time() {
#ifdef __APPLE__
  timeval now;
  gettimeofday(&now, nullptr);
  return now.tv_sec + 1e-6 * now.tv_usec;
#else
  timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  return now.tv_sec + 1e-9 * now.tv_nsec;
#endif
}
|
||
|
|
||
|
// Compile-time constant passed by run_benchmark as the trailing template
// argument of both Transform1D and MultiThreadTransform1D.
// NOTE(review): its exact meaning is defined by the transform headers, not
// visible here — confirm before changing the value.
#define kernel_size (16)
|
||
|
|
||
|
template <typename Context, typename Params>
|
||
|
void run_benchmark(const std::string& name, int repetitions, int elements,
|
||
|
Context* context, const Params& params) {
|
||
|
std::cout << "Benchmark: " << name << std::endl;
|
||
|
std::cout << "Warmup single." << std::endl;
|
||
|
|
||
|
for (int i = 0; i < 10; ++i) {
|
||
|
Transform1D<Params, kernel_size>(params);
|
||
|
}
|
||
|
|
||
|
std::cout << "Benchmark single." << std::endl;
|
||
|
|
||
|
double start = time();
|
||
|
|
||
|
for (int i = 0; i < repetitions; ++i) {
|
||
|
Transform1D<Params, kernel_size>(params);
|
||
|
}
|
||
|
|
||
|
double wall_time = time() - start;
|
||
|
double ops = static_cast<double>(elements) * repetitions;
|
||
|
std::cout << "Avg: " << (wall_time / repetitions) << std::endl;
|
||
|
std::cout << "Perf: " << static_cast<std::int64_t>(ops / wall_time) << "/s."
|
||
|
<< std::endl;
|
||
|
|
||
|
std::cout << "Warmup single." << std::endl;
|
||
|
|
||
|
for (int i = 0; i < 10; ++i) {
|
||
|
MultiThreadTransform1D<Context, Params, kernel_size>(context, params);
|
||
|
}
|
||
|
|
||
|
std::cout << "Benchmark multi." << std::endl;
|
||
|
|
||
|
start = time();
|
||
|
|
||
|
for (int i = 0; i < repetitions; ++i) {
|
||
|
MultiThreadTransform1D<Context, Params, kernel_size>(context, params);
|
||
|
}
|
||
|
|
||
|
wall_time = time() - start;
|
||
|
ops = static_cast<double>(elements) * repetitions;
|
||
|
std::cout << "Avg: " << (wall_time / repetitions) << std::endl;
|
||
|
std::cout << "Perf: " << static_cast<std::int64_t>(ops / wall_time) << "/s."
|
||
|
<< std::endl;
|
||
|
}
|
||
|
|
||
|
int main() {
|
||
|
const int repetitions = 500;
|
||
|
const int elements = 4 * 1024 * 1024;
|
||
|
|
||
|
std::unique_ptr<std::int32_t[]> int32_array(new std::int32_t[elements]);
|
||
|
std::unique_ptr<std::uint8_t[]> uint8_array(new std::uint8_t[elements]);
|
||
|
std::unique_ptr<float[]> float_array(new float[elements]);
|
||
|
|
||
|
typedef SimpleContext<gemmlowp::WorkersPool> Context;
|
||
|
Context context(4, new gemmlowp::WorkersPool());
|
||
|
|
||
|
typedef Transform1DParams<std::int32_t, std::uint8_t, Requantize> RequantizeParams;
|
||
|
RequantizeParams requantize_params;
|
||
|
requantize_params.input = int32_array.get();
|
||
|
requantize_params.output = uint8_array.get();
|
||
|
requantize_params.kernel.count = elements;
|
||
|
requantize_params.kernel.input_range_min = -100.0f;
|
||
|
requantize_params.kernel.input_range_scale =
|
||
|
200.0f / ((static_cast<std::int64_t>(1) << 32) - 1);
|
||
|
requantize_params.kernel.input_range_offset =
|
||
|
static_cast<float>(std::numeric_limits<std::int32_t>::lowest());
|
||
|
requantize_params.kernel.output_range_min = -200.0f;
|
||
|
requantize_params.kernel.one_over_output_range_scale =
|
||
|
static_cast<float>((static_cast<std::int64_t>(1) << 8) - 1) / 500.0f;
|
||
|
requantize_params.kernel.output_range_offset =
|
||
|
static_cast<float>(std::numeric_limits<std::uint8_t>::lowest());
|
||
|
|
||
|
run_benchmark("Requantize", repetitions, elements, &context,
|
||
|
requantize_params);
|
||
|
|
||
|
typedef Transform1DParams<std::uint8_t, float, Dequantize> DequantizeParams;
|
||
|
DequantizeParams dequantize_params;
|
||
|
dequantize_params.input = uint8_array.get();
|
||
|
dequantize_params.output = float_array.get();
|
||
|
dequantize_params.kernel.count = elements;
|
||
|
dequantize_params.kernel.range_min = -100.0f;
|
||
|
dequantize_params.kernel.range_scale =
|
||
|
static_cast<float>((static_cast<std::int64_t>(1) << 8) - 1) / 200.0f;
|
||
|
dequantize_params.kernel.range_offset =
|
||
|
static_cast<float>(std::numeric_limits<std::uint8_t>::lowest());
|
||
|
|
||
|
run_benchmark("Dequantize", repetitions, elements, &context,
|
||
|
dequantize_params);
|
||
|
|
||
|
typedef Transform1DParams<float, std::uint8_t, Quantize> QuantizeParams;
|
||
|
QuantizeParams quantize_params;
|
||
|
quantize_params.input = float_array.get();
|
||
|
quantize_params.output = uint8_array.get();
|
||
|
quantize_params.kernel.count = elements;
|
||
|
quantize_params.kernel.range_min = -100.0f;
|
||
|
quantize_params.kernel.range_scale =
|
||
|
200.0f / ((static_cast<std::int64_t>(1) << 8) - 1);
|
||
|
quantize_params.kernel.range_offset =
|
||
|
static_cast<float>(std::numeric_limits<std::uint8_t>::lowest());
|
||
|
|
||
|
run_benchmark("Quantize", repetitions, elements, &context, quantize_params);
|
||
|
|
||
|
return 0;
|
||
|
}
|