You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
371 lines
12 KiB
371 lines
12 KiB
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#ifdef __APPLE__
|
|
#include <sys/time.h>
|
|
#endif
|
|
|
|
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <vector>
|
|
#ifdef __APPLE__
|
|
#include <TargetConditionals.h>
|
|
#endif
|
|
|
|
#include "test.h"
|
|
|
|
// Bit-depth parameter set handed to Gemm by the benchmark below. A build can
// pick a different one by defining this macro before this point.
#if !defined(GEMMLOWP_TEST_BIT_DEPTH_PARAMS)
#define GEMMLOWP_TEST_BIT_DEPTH_PARAMS DefaultL8R8BitDepthParams
#endif

// Build sanity checks: warn when the target architecture advertises a SIMD
// extension but the corresponding GEMMLOWP_* macro is not defined.
#if !defined(GEMMLOWP_NEON) && defined(__arm__)
#warning "Building without NEON support on ARM, check your compiler setup!"
#endif

#if !defined(GEMMLOWP_MSA) && defined(__mips)
#warning "Building without MSA support on MIPS, check your compiler setup!"
#endif

#if !defined(GEMMLOWP_AVX2) && defined(__AVX2__)
#warning "Building without AVX2 support on AVX2 enabled machine, check your compiler setup!"
#endif

#if !defined(GEMMLOWP_SSE4) && !defined(GEMMLOWP_AVX2) && defined(__SSE4_2__)
#warning "Building without SSE4.2 support on SSE4.2 enabled machine, check your compiler setup!"
#endif
|
|
|
|
namespace gemmlowp {
|
|
|
|
// A timed batch must last at least this long (compared against differences of
// real_time_in_seconds() values) before its measurement is accepted.
const double min_accurate_duration = 0.1;

// Lower bound, in bytes, on the total size of the matrix pool that the
// benchmark cycles through.
const std::size_t min_working_set_size = 16u * 1024u * 1024u;
|
|
|
|
// Shape of one GEMM: a (rows x depth) matrix times a (depth x cols) matrix.
// A default-constructed shape has all three dimensions equal to zero.
struct gemm_t {
  int rows = 0;
  int depth = 0;
  int cols = 0;

  gemm_t() = default;
  gemm_t(int r, int d, int c) : rows(r), depth(d), cols(c) {}
};
|
|
|
|
// Strict lexicographic ordering on (rows, depth, cols), so that gemm_t can be
// used as an ordered-container key.
bool operator<(const gemm_t& a, const gemm_t& b) {
  if (a.rows != b.rows) {
    return a.rows < b.rows;
  }
  if (a.depth != b.depth) {
    return a.depth < b.depth;
  }
  return a.cols < b.cols;
}
|
|
|
|
template <typename LhsType, typename RhsType, typename ResultType>
|
|
double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
|
|
typedef std::uint8_t Scalar;
|
|
|
|
// set up the matrix pool
|
|
|
|
std::size_t combined_gemm_sizes = 0;
|
|
for (auto gemm : gemms) {
|
|
int rows = gemm.rows;
|
|
int depth = gemm.depth;
|
|
int cols = gemm.cols;
|
|
combined_gemm_sizes +=
|
|
sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
|
|
}
|
|
|
|
const std::size_t pool_size = 1 + min_working_set_size / combined_gemm_sizes;
|
|
|
|
std::vector<LhsType> lhs(pool_size * gemms.size());
|
|
std::vector<RhsType> rhs(pool_size * gemms.size());
|
|
std::vector<ResultType> result(pool_size * gemms.size());
|
|
|
|
for (std::size_t i = 0; i < pool_size; i++) {
|
|
for (std::size_t j = 0; j < gemms.size(); j++) {
|
|
int k = i * gemms.size() + j;
|
|
lhs[k].Resize(gemms[j].rows, gemms[j].depth);
|
|
MakeConstant(&lhs[k], 0);
|
|
rhs[k].Resize(gemms[j].depth, gemms[j].cols);
|
|
MakeConstant(&rhs[k], 0);
|
|
result[k].Resize(gemms[j].rows, gemms[j].cols);
|
|
MakeConstant(&result[k], 0);
|
|
}
|
|
}
|
|
|
|
// main benchmark loop
|
|
|
|
int iters_at_a_time = 1;
|
|
float time_per_iter = 0.0f;
|
|
std::size_t pool_index = 0;
|
|
|
|
while (true) {
|
|
double starttime = real_time_in_seconds();
|
|
for (int i = 0; i < iters_at_a_time; i++) {
|
|
for (size_t j = 0; j < gemms.size(); j++) {
|
|
size_t k = pool_index * gemms.size() + j;
|
|
Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
|
|
context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
|
|
-75, -91, 74980, 123, 20);
|
|
}
|
|
pool_index++;
|
|
if (pool_index == pool_size) {
|
|
pool_index = 0;
|
|
}
|
|
}
|
|
double endtime = real_time_in_seconds();
|
|
|
|
const float timing = static_cast<float>(endtime - starttime);
|
|
|
|
if (timing >= min_accurate_duration) {
|
|
time_per_iter = timing / iters_at_a_time;
|
|
break;
|
|
}
|
|
|
|
iters_at_a_time *= 2;
|
|
}
|
|
|
|
return time_per_iter;
|
|
}
|
|
|
|
template <typename LhsType, typename RhsType, typename ResultType>
|
|
double gflops_for_gemms(GemmContext* context,
|
|
const std::vector<gemm_t>& gemms) {
|
|
const double time_per_iter =
|
|
time_for_gemms<LhsType, RhsType, ResultType>(context, gemms);
|
|
double ops = 0;
|
|
for (auto gemm : gemms) {
|
|
ops += 2.0 * gemm.rows * gemm.depth * gemm.cols;
|
|
}
|
|
return 1e-9 * ops / time_per_iter;
|
|
}
|
|
|
|
// Benchmarks a fixed list of GEMM shapes one at a time and prints, for each
// shape, the best rate observed over the recorded repetitions.
//
// The whole list is run `repeat + 1` times; results of the first run are
// discarded as warm-up. Results are keyed by shape in a std::map, so the
// report is printed in operator< order of the shapes.
//
// context: passed through to the GEMM calls being measured.
void benchmark(GemmContext* context) {
  std::map<gemm_t, std::vector<double>> benchmark_results;

  const std::vector<gemm_t> benchmark_gemms = {
      gemm_t(10, 10, 10),       gemm_t(20, 20, 20),
      gemm_t(30, 30, 30),       gemm_t(40, 40, 40),
      gemm_t(50, 50, 50),       gemm_t(60, 60, 60),
      gemm_t(64, 256, 147),     gemm_t(100, 100, 1),
      gemm_t(100, 100, 100),    gemm_t(100, 1000, 100),
      gemm_t(1000, 1000, 1),    gemm_t(1000, 1000, 10),
      gemm_t(1000, 1000, 100),  gemm_t(1000, 1000, 1000),
  };

  const int repeat = 2;

  using LhsType = Matrix<std::uint8_t, MapOrder::RowMajor>;
  using RhsType = Matrix<std::uint8_t, MapOrder::ColMajor>;
  using ResultType = Matrix<std::uint8_t, MapOrder::ColMajor>;

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  // We don't record the first repetition, it's just warm-up.
  for (int r = 0; r < repeat + 1; r++) {
    // Progress line; the trailing '\r' lets the next one overwrite it.
    std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
              << std::flush;
    for (const gemm_t& gemm : benchmark_gemms) {
      // Each shape is timed on its own, as a one-element list.
      const std::vector<gemm_t> unique_gemm(1, gemm);
      const double gflops =
          gflops_for_gemms<LhsType, RhsType, ResultType>(context, unique_gemm);
      if (r > 0) {
        benchmark_results[gemm].push_back(gflops);
      }
    }
  }

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  std::cout << " \r"
            << std::flush;

  std::cout.precision(4);

  for (const auto& entry : benchmark_results) {
    const gemm_t& shape = entry.first;
    const std::vector<double>& rates = entry.second;
    // Only the best rate is reported. An entry exists only once at least one
    // rate has been recorded for it, so `rates` is never empty here.
    const double best = *std::max_element(rates.begin(), rates.end());
    std::cout << shape.rows << "x" << shape.depth << "x" << shape.cols
              << " : " << best << " GFlops/s" << std::endl;
  }
  std::cout << std::endl;
}
|
|
|
|
// Repeatedly times one pass over `gemms` (via time_for_gemms) until `mintime`
// has elapsed according to real_time_in_seconds(), then prints latency
// statistics over the collected samples: best, worst, mean, a trimmed mean
// that drops the lowest and highest 25% of samples, and the mean of the
// fastest 10%.
//
// context: passed through to the GEMM calls being measured.
// gemms:   shapes that together make up one timed pass.
// mintime: minimum total measuring time. At least one sample is always taken,
//          even if mintime is zero or negative.
void benchmark_gemm_sizes(GemmContext* context,
                          const std::vector<gemm_t>& gemms, double mintime) {
  using LhsType = Matrix<std::uint8_t, MapOrder::RowMajor>;
  using RhsType = Matrix<std::uint8_t, MapOrder::ColMajor>;
  using ResultType = Matrix<std::uint8_t, MapOrder::ColMajor>;

  std::vector<float> gemm_times;
  std::cout << "running for " << mintime << " seconds..." << std::endl;

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
#endif

  const double starttime = real_time_in_seconds();
  // do/while so the statistics below always have at least one sample to work
  // with (front()/back() on an empty vector would be undefined behaviour).
  do {
    gemm_times.push_back(static_cast<float>(
        time_for_gemms<LhsType, RhsType, ResultType>(context, gemms)));
  } while (real_time_in_seconds() < starttime + mintime);

#ifdef GEMMLOWP_TEST_PROFILE
  gemmlowp::FinishProfiling();
#endif

  // Ascending order: fastest samples first.
  std::sort(gemm_times.begin(), gemm_times.end());

  const std::size_t sample_count = gemm_times.size();

  const float trim_ratio = 0.25;
  const std::size_t count_trimmed =
      static_cast<std::size_t>(sample_count * trim_ratio);

  const float best_ratio = 0.1;
  // Never fewer than one sample: with under ten samples the truncated count
  // would be zero and the "best" mean would come out as 0/0.
  const std::size_t count_best = std::max<std::size_t>(
      1, static_cast<std::size_t>(sample_count * best_ratio));

  double sum_gemm_times = 0;
  double sum_gemm_times_trimmed = 0;
  std::size_t count_gemm_times_trimmed = 0;
  double sum_gemm_times_best = 0;
  std::size_t count_gemm_times_best = 0;

  for (std::size_t i = 0; i < sample_count; i++) {
    const double sample = gemm_times[i];
    sum_gemm_times += sample;
    // Middle of the sorted samples, with count_trimmed dropped at each end.
    if (i >= count_trimmed && i < sample_count - count_trimmed) {
      sum_gemm_times_trimmed += sample;
      count_gemm_times_trimmed++;
    }
    if (i < count_best) {
      sum_gemm_times_best += sample;
      count_gemm_times_best++;
    }
  }

  const double min_latency = gemm_times.front();
  const double max_latency = gemm_times.back();
  const double mean_latency = sum_gemm_times / sample_count;
  const double trimmed_mean_latency =
      sum_gemm_times_trimmed / count_gemm_times_trimmed;
  const double best_mean_latency = sum_gemm_times_best / count_gemm_times_best;

  std::cout << "Graph latency (over " << sample_count
            << " iterations):" << std::endl;
  std::cout << " Best: " << min_latency << "s" << std::endl;
  std::cout << " Worst: " << max_latency << "s" << std::endl;
  std::cout << " Mean: " << mean_latency << "s" << std::endl;
  std::cout << " " << 100 * trim_ratio
            << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl;
  std::cout << " Mean of " << 100 * best_ratio
            << "% best: " << best_mean_latency << "s" << std::endl;
}
|
|
|
|
// Runs the latency benchmark (benchmark_gemm_sizes) over a list of GEMM shapes
// described as typical for GoogLeNet, measuring for at least 20 seconds.
//
// context: passed through to the GEMM calls being measured.
void benchmark_googlenet(GemmContext* context) {
  // These are the m, n, k sizes for a typical GoogLeNet.
  // Flat list of triples. As consumed below, each triple is
  // (cols, rows, depth) of the resulting gemm_t.
  const int googlenet_gemm_sizes[] = {
      12544, 64,  147, 3136, 64,   64,   3136, 192,  576,  784, 64,   192,
      784,   96,  192, 784,  128,  864,  784,  16,   192,  784, 32,   400,
      784,   32,  192, 784,  128,  256,  784,  128,  256,  784, 192,  1152,
      784,   32,  256, 784,  96,   800,  784,  64,   256,  196, 192,  480,
      196,   96,  480, 196,  204,  864,  196,  16,   480,  196, 48,   400,
      196,   64,  480, 196,  160,  508,  196,  112,  508,  196, 224,  1008,
      196,   24,  508, 196,  64,   600,  196,  64,   508,  196, 128,  512,
      196,   128, 512, 196,  256,  1152, 196,  24,   512,  196, 64,   600,
      196,   64,  512, 196,  112,  512,  196,  144,  512,  196, 288,  1296,
      196,   32,  512, 196,  64,   800,  196,  64,   512,  196, 256,  528,
      196,   160, 528, 196,  320,  1440, 196,  32,   528,  196, 128,  800,
      196,   128, 528, 49,   256,  832,  49,   160,  832,  49,  320,  1440,
      49,    48,  832, 49,   128,  1200, 49,   128,  832,  49,  384,  832,
      49,    192, 832, 49,   384,  1728, 49,   48,   832,  49,  128,  1200,
      49,    128, 832, 16,   128,  508,  1,    1024, 2048, 1,   1008, 1024,
      16,    128, 528, 1,    1024, 2048, 1,    1008, 1024, 1,   1008, 1024,
  };
  // The array length is known at compile time, so a malformed table is a
  // build error rather than a debug-only runtime check.
  static_assert(
      sizeof(googlenet_gemm_sizes) % (3 * sizeof(googlenet_gemm_sizes[0])) ==
          0,
      "googlenet_gemm_sizes must hold whole (cols, rows, depth) triples");
  const std::size_t num_googlenet_gemms =
      sizeof(googlenet_gemm_sizes) / (3 * sizeof(googlenet_gemm_sizes[0]));

  std::vector<gemm_t> googlenet_gemms;
  googlenet_gemms.reserve(num_googlenet_gemms);
  for (std::size_t i = 0; i < num_googlenet_gemms; i++) {
    const int* triple = &googlenet_gemm_sizes[3 * i];
    // gemm_t takes (rows, depth, cols); the table stores cols first.
    googlenet_gemms.emplace_back(triple[1], triple[2], triple[0]);
  }

  const double mintime = 20.0;
  benchmark_gemm_sizes(context, googlenet_gemms, mintime);
}
|
|
|
|
// Runs the latency benchmark (benchmark_gemm_sizes) over the GEMM shapes of a
// small model with large batches, measuring for at least 10 seconds.
//
// context: passed through to the GEMM calls being measured.
void benchmark_small_model(GemmContext* context) {
  // These are the m, n, k sizes for a small model with large batches.
  // Flat list of triples. As consumed below, each triple is
  // (cols, rows, depth) of the resulting gemm_t.
  const int small_model_gemm_sizes[] = {
      29232, 16, 25, 7308, 6, 400, 203, 3002, 216,
  };
  // The array length is known at compile time, so a malformed table is a
  // build error rather than a debug-only runtime check.
  static_assert(sizeof(small_model_gemm_sizes) %
                        (3 * sizeof(small_model_gemm_sizes[0])) ==
                    0,
                "small_model_gemm_sizes must hold whole (cols, rows, depth) "
                "triples");
  const std::size_t num_small_model_gemms =
      sizeof(small_model_gemm_sizes) / (3 * sizeof(small_model_gemm_sizes[0]));

  std::vector<gemm_t> small_model_gemms;
  small_model_gemms.reserve(num_small_model_gemms);
  for (std::size_t i = 0; i < num_small_model_gemms; i++) {
    const int* triple = &small_model_gemm_sizes[3 * i];
    // gemm_t takes (rows, depth, cols); the table stores cols first.
    small_model_gemms.emplace_back(triple[1], triple[2], triple[0]);
  }

  const double mintime = 10.0;
  benchmark_gemm_sizes(context, small_model_gemms, mintime);
}
|
|
|
|
void benchmark_all() {
|
|
{
|
|
gemmlowp::GemmContext context;
|
|
std::cout << "Benchmarking small model GEMMs..." << std::endl;
|
|
gemmlowp::benchmark_small_model(&context);
|
|
}
|
|
|
|
{
|
|
gemmlowp::GemmContext context;
|
|
std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
|
|
gemmlowp::benchmark_googlenet(&context);
|
|
}
|
|
|
|
{
|
|
gemmlowp::GemmContext context;
|
|
context.set_max_num_threads(0);
|
|
std::cout << "Benchmarking multi-threaded mode..." << std::endl;
|
|
gemmlowp::benchmark(&context);
|
|
}
|
|
|
|
{
|
|
gemmlowp::GemmContext context;
|
|
context.set_max_num_threads(1);
|
|
std::cout << "Benchmarking single-threaded mode..." << std::endl;
|
|
gemmlowp::benchmark(&context);
|
|
}
|
|
}
|
|
|
|
} // end namespace gemmlowp
|
|
|
|
// For iOS, we need to define our own main(), so skip it here.
|
|
#if !(defined(__APPLE__) && (TARGET_OS_IPHONE || TARGET_IPHONE_SIMULATOR))
|
|
// Program entry point: runs the full benchmark suite and exits successfully.
int main() {
  gemmlowp::benchmark_all();
  return 0;
}
|
|
#endif
|