// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #ifdef __APPLE__ #include #endif #include #include #include #include #include #include #include #include "../eight_bit_int_gemm/eight_bit_int_gemm.h" #include "test.h" #if defined(__arm__) && !defined(GEMMLOWP_NEON) #warning "Building without NEON support on ARM, check your compiler setup!" #endif double time() { #ifdef __APPLE__ timeval t; gettimeofday(&t, nullptr); return t.tv_sec + 1e-6 * t.tv_usec; #else timespec t; clock_gettime(CLOCK_REALTIME, &t); return t.tv_sec + 1e-9 * t.tv_nsec; #endif } const std::int32_t MIN_WORKING_SET_SIZE = 2 * 1024 * 1024; const double MIN_OPS = 1000.0 * 1000000.0; struct WorkingSet { WorkingSet() : lhs(nullptr), rhs(nullptr), result(nullptr) {} void init(std::int32_t n, std::int32_t m, std::int32_t k) { lhs = new std::uint8_t[n * k]; rhs = new std::uint8_t[k * m]; result = new std::uint8_t[m * n]; } std::uint8_t* lhs; std::uint8_t* rhs; std::uint8_t* result; }; struct Shape { std::int32_t n; std::int32_t m; std::int32_t k; std::int32_t repetitions; std::int32_t current_set; std::vector working_sets; Shape(std::int32_t n, std::int32_t m, std::int32_t k) : n(n), m(m), k(k), repetitions(1), current_set(0), working_sets() {} void init() { const std::int32_t size = n * k + k * m + n * m; const std::int32_t count = MIN_WORKING_SET_SIZE / size + 1; const double ops = static_cast(n) * static_cast(m) * static_cast(k); for (int i = 0; i < count; ++i) { working_sets.push_back(WorkingSet()); working_sets.back().init(n, m, k); } current_set = 0; repetitions = MIN_OPS / ops + 20; } WorkingSet& working_set() { return working_sets[current_set]; } void next_working_set() { current_set = (current_set + 1) % working_sets.size(); } }; double run_gemm(std::int32_t n, std::int32_t m, std::int32_t k, std::uint8_t* lhs, std::uint8_t* rhs, std::uint8_t* result) { gemmlowp::eight_bit_int_gemm::EightBitIntGemm( true, false, false, m, n, k, rhs, -100, k, lhs, -100, k, result, 10000, 10, 3, m, gemmlowp::eight_bit_int_gemm::BitDepthSetting::A8B8); return static_cast(n * m * k * 2); } double run_gemms(std::vector* shapes) { double ops = 0.0; for (auto& shape : *shapes) { ops += run_gemm(shape.n, shape.m, shape.k, shape.working_set().lhs, shape.working_set().rhs, shape.working_set().result); } return ops; } void print_summary(std::vector* times, bool full) { std::sort(times->begin(), times->end()); double sum_times = 0; double sum_times_trimmed = 0; int count_times_trimmed = 0; const float trim_ratio = 0.25; const size_t count_trimmed = times->size() * trim_ratio; double sum_times_best = 0; int count_times_best = 0; const float best_ratio = 0.1; const size_t count_best = times->size() * best_ratio; for (size_t i = 0; i < times->size(); i++) { sum_times += (*times)[i]; if (i >= count_trimmed && i < times->size() - count_trimmed) { sum_times_trimmed += (*times)[i]; count_times_trimmed++; } if (i < count_best) { sum_times_best += (*times)[i]; count_times_best++; } } const double min_latency = times->front(); const double max_latency = times->back(); const double mean_latency = sum_times / times->size(); const double trimmed_mean_latency = sum_times_trimmed / count_times_trimmed; const double best_mean_latency = sum_times_best / count_times_best; if (full) { std::cout << "Graph latency (over " << times->size() << " iterations):" << std::endl; std::cout << " Best: " << min_latency << "s" << std::endl; std::cout << " Worst: " << max_latency << "s" << std::endl; std::cout << " Mean: " << mean_latency << "s" << std::endl; std::cout << " " << 100 * trim_ratio << "% trimmed mean: " << trimmed_mean_latency << "s" << std::endl; std::cout << " Mean of " << 100 * best_ratio << "% best: " << best_mean_latency << "s" << std::endl; } else { std::cout << (mean_latency * 1000.0) << std::endl; } } void time_all(std::vector* shapes, std::int32_t repetitions, double max_time) { std::vector times; double ops = 0.0; double sum_time = 0.0; while (sum_time < max_time) { double start = time(); for (int i = 0; i < repetitions; ++i) { ops += run_gemms(shapes); } double delta_time = (time() - start); times.push_back(delta_time / repetitions); sum_time += delta_time; } print_summary(×, true); } void time_one(Shape* shape, double max_time) { std::vector times; double ops = 0.0; double sum_time = 0.0; std::cout << std::setprecision(6) << std::fixed << shape->n << ", " << shape->m << ", " << shape->k << ", " << std::flush; while (sum_time < max_time) { double start = time(); for (int i = 0; i < shape->repetitions; ++i) { ops += run_gemm(shape->n, shape->m, shape->k, shape->working_set().lhs, shape->working_set().rhs, shape->working_set().result); shape->next_working_set(); } double delta_time = (time() - start); times.push_back(delta_time / shape->repetitions); sum_time += delta_time; } print_summary(×, false); } int main() { std::vector googlenet_gemms; googlenet_gemms.push_back(Shape(12544, 64, 147)); googlenet_gemms.push_back(Shape(3136, 64, 64)); googlenet_gemms.push_back(Shape(3136, 192, 576)); googlenet_gemms.push_back(Shape(784, 64, 192)); googlenet_gemms.push_back(Shape(784, 96, 192)); googlenet_gemms.push_back(Shape(784, 128, 864)); googlenet_gemms.push_back(Shape(784, 16, 192)); googlenet_gemms.push_back(Shape(784, 32, 400)); googlenet_gemms.push_back(Shape(784, 32, 192)); googlenet_gemms.push_back(Shape(784, 128, 256)); googlenet_gemms.push_back(Shape(784, 128, 256)); googlenet_gemms.push_back(Shape(784, 192, 1152)); googlenet_gemms.push_back(Shape(784, 32, 256)); googlenet_gemms.push_back(Shape(784, 96, 800)); googlenet_gemms.push_back(Shape(784, 64, 256)); googlenet_gemms.push_back(Shape(196, 192, 480)); googlenet_gemms.push_back(Shape(196, 96, 480)); googlenet_gemms.push_back(Shape(196, 204, 864)); googlenet_gemms.push_back(Shape(196, 16, 480)); googlenet_gemms.push_back(Shape(196, 48, 400)); googlenet_gemms.push_back(Shape(196, 64, 480)); googlenet_gemms.push_back(Shape(196, 160, 508)); googlenet_gemms.push_back(Shape(196, 112, 508)); googlenet_gemms.push_back(Shape(196, 224, 1008)); googlenet_gemms.push_back(Shape(196, 24, 508)); googlenet_gemms.push_back(Shape(196, 64, 600)); googlenet_gemms.push_back(Shape(196, 64, 508)); googlenet_gemms.push_back(Shape(196, 128, 512)); googlenet_gemms.push_back(Shape(196, 128, 512)); googlenet_gemms.push_back(Shape(196, 256, 1152)); googlenet_gemms.push_back(Shape(196, 24, 512)); googlenet_gemms.push_back(Shape(196, 64, 600)); googlenet_gemms.push_back(Shape(196, 64, 512)); googlenet_gemms.push_back(Shape(196, 112, 512)); googlenet_gemms.push_back(Shape(196, 144, 512)); googlenet_gemms.push_back(Shape(196, 288, 1296)); googlenet_gemms.push_back(Shape(196, 32, 512)); googlenet_gemms.push_back(Shape(196, 64, 800)); googlenet_gemms.push_back(Shape(196, 64, 512)); googlenet_gemms.push_back(Shape(196, 256, 528)); googlenet_gemms.push_back(Shape(196, 160, 528)); googlenet_gemms.push_back(Shape(196, 320, 1440)); googlenet_gemms.push_back(Shape(196, 32, 528)); googlenet_gemms.push_back(Shape(196, 128, 800)); googlenet_gemms.push_back(Shape(196, 128, 528)); googlenet_gemms.push_back(Shape(49, 256, 832)); googlenet_gemms.push_back(Shape(49, 160, 832)); googlenet_gemms.push_back(Shape(49, 320, 1440)); googlenet_gemms.push_back(Shape(49, 48, 832)); googlenet_gemms.push_back(Shape(49, 128, 1200)); googlenet_gemms.push_back(Shape(49, 128, 832)); googlenet_gemms.push_back(Shape(49, 384, 832)); googlenet_gemms.push_back(Shape(49, 192, 832)); googlenet_gemms.push_back(Shape(49, 384, 1728)); googlenet_gemms.push_back(Shape(49, 48, 832)); googlenet_gemms.push_back(Shape(49, 128, 1200)); googlenet_gemms.push_back(Shape(49, 128, 832)); googlenet_gemms.push_back(Shape(16, 128, 508)); googlenet_gemms.push_back(Shape(1, 1024, 2048)); googlenet_gemms.push_back(Shape(1, 1008, 1024)); googlenet_gemms.push_back(Shape(16, 128, 528)); googlenet_gemms.push_back(Shape(1, 1024, 2048)); googlenet_gemms.push_back(Shape(1, 1008, 1024)); googlenet_gemms.push_back(Shape(1, 1008, 1024)); for (auto& shape : googlenet_gemms) { shape.init(); } std::vector small_gemms; small_gemms.push_back(Shape(29232, 16, 25)); small_gemms.push_back(Shape(7308, 6, 400)); small_gemms.push_back(Shape(203, 3002, 216)); for (auto& shape : small_gemms) { shape.init(); } std::vector others; others.push_back(Shape(100, 100, 100)); others.push_back(Shape(1000, 1000, 1000)); others.push_back(Shape(2000, 1000, 1000)); for (auto& shape : others) { shape.init(); } std::vector lstm; lstm.push_back(Shape(1, 500, 320)); lstm.push_back(Shape(1, 100, 500)); lstm.push_back(Shape(1, 500, 500)); lstm.push_back(Shape(1, 500, 100)); lstm.push_back(Shape(1, 2000, 100)); for (auto& shape : lstm) { shape.init(); } gemmlowp::eight_bit_int_gemm::SetMaxNumThreads(4); std::cout << "Warmup run." << std::endl; time_all(&googlenet_gemms, 10, 1.0); time_all(&small_gemms, 50, 1.0); std::cout << "Timing all." << std::endl; time_all(&googlenet_gemms, 10, 10.0); time_all(&small_gemms, 50, 10.0); std::cout << "Timing separate." << std::endl; for (auto& shape : googlenet_gemms) { time_one(&shape, 0.10); } for (auto& shape : small_gemms) { time_one(&shape, 0.10); } for (auto& shape : others) { time_one(&shape, 0.10); } for (auto& shape : lstm) { time_one(&shape, 0.10); } return 0; }