You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
374 lines
12 KiB
374 lines
12 KiB
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// profiler.h: a simple sampling profiler that's always just one #include away!
|
|
//
|
|
// Overview
|
|
// ========
|
|
//
|
|
// This profiler only samples a pseudo-stack, not the actual call stack.
|
|
// The code to be profiled needs to be instrumented with
|
|
// pseudo-stack "labels", see ScopedProfilingLabel.
|
|
// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
|
|
// portable, and independent of compilation details such as function inlining
|
|
// and frame pointers. The granularity of instrumentation can be freely chosen,
|
|
// and it is possible to get some annotate-like detail, i.e. detail within one
|
|
// function without splitting it into multiple functions.
|
|
//
|
|
// This profiler should remain small and simple; its key feature is to fit in
|
|
// a single header file so that there should never be a reason to refrain
|
|
// from profiling. More complex and feature-rich alternatives are
|
|
// readily available. This one offers a strict superset of its
|
|
// functionality: https://github.com/bgirard/GeckoProfiler, including
|
|
// intertwining pseudostacks with real call stacks, more annotation options,
|
|
// and advanced visualization.
|
|
//
|
|
// Usage
|
|
// =====
|
|
//
|
|
// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
|
|
// not enabled, profiling instrumentation from instrumentation.h
|
|
// (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
|
|
// is still defined but does nothing. On the other hand,
|
|
// when profiling is not enabled, it is an error to #include the
|
|
// present file.
|
|
//
|
|
// 1. Each thread can opt in to profiling by calling
|
|
// RegisterCurrentThreadForProfiling() defined in instrumentation.h.
|
|
// This can be done at any time, before or during profiling.
|
|
// No sample will be collected from a thread until
|
|
// it has called RegisterCurrentThreadForProfiling().
|
|
//
|
|
// 2. Instrument your code to be profiled with ScopedProfilingLabel,
|
|
// which is a RAII helper defined in instrumentation.h. The identifier
|
|
// names (some_label, etc) do not matter; what will show up
|
|
// in the profile is the string passed to the constructor, which
|
|
// must be a literal string. See the full example below.
|
|
//
|
|
// Note: the overhead of ScopedProfilingLabel is zero when not
|
|
// enabling profiling (when not defining GEMMLOWP_PROFILING).
|
|
//
|
|
// 3. Use the profiler.h interface to control profiling. There are two
|
|
// functions: StartProfiling() and FinishProfiling(). They must be
|
|
// called on the same thread. FinishProfiling() prints the profile
|
|
// on stdout.
|
|
//
|
|
// Full example
|
|
// ============
|
|
/*
|
|
#define GEMMLOWP_PROFILING
|
|
#include "profiling/instrumentation.h"
|
|
using namespace gemmlowp;
|
|
|
|
const int iters = 100000000;
|
|
volatile int i;
|
|
|
|
void Bar() {
|
|
ScopedProfilingLabel label("Bar");
|
|
for (i = 0; i < iters; i++) {}
|
|
}
|
|
|
|
void Foo() {
|
|
ScopedProfilingLabel label("Foo");
|
|
for (i = 0; i < iters; i++) {}
|
|
Bar();
|
|
}
|
|
|
|
void Init() {
|
|
RegisterCurrentThreadForProfiling();
|
|
}
|
|
|
|
#include "profiling/profiler.h"
|
|
|
|
int main() {
|
|
Init();
|
|
StartProfiling();
|
|
Foo();
|
|
FinishProfiling();
|
|
}
|
|
*
|
|
* Output:
|
|
*
|
|
gemmlowp profile (1 threads, 304 samples)
|
|
100.00% Foo
|
|
51.32% other
|
|
48.68% Bar
|
|
0.00% other (outside of any label)
|
|
*/
|
|
//
|
|
// Interpreting results
|
|
// ====================
|
|
//
|
|
// Each node shows the absolute percentage, among all the samples,
|
|
// of the number of samples that recorded the given pseudo-stack.
|
|
// The percentages are *NOT* relative to the parent node. In addition
|
|
// to your own labels, you will also see 'other' nodes that collect
|
|
// the remainder of samples under the parent node that didn't fall into
|
|
// any of the labelled child nodes. Example:
|
|
//
|
|
// 20% Foo
|
|
// 12% Bar
|
|
// 6% Xyz
|
|
// 2% other
|
|
//
|
|
// This means that 20% of all samples were under Foo, of which 12%/20%==60%
|
|
// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
|
|
// under either Bar or Xyz.
|
|
//
|
|
// Typically, one wants to keep adding ScopedProfilingLabel's until
|
|
// the 'other' nodes show low percentages.
|
|
//
|
|
// Interpreting results with multiple threads
|
|
// ==========================================
|
|
//
|
|
// At each sample, each thread registered for profiling gets sampled once.
|
|
// So if there is one "main thread" spending its time in MainFunc() and
|
|
// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
|
|
// samples will be in WorkerFunc, so the profile will look like this:
|
|
//
|
|
// 80% WorkerFunc
|
|
// 20% MainFunc
|
|
|
|
#ifndef GEMMLOWP_PROFILING_PROFILER_H_
|
|
#define GEMMLOWP_PROFILING_PROFILER_H_
|
|
|
|
#ifndef GEMMLOWP_PROFILING
|
|
#error Profiling is not enabled!
|
|
#endif
|
|
|
|
#include <algorithm>
#include <cstdio>
#include <memory>
#include <vector>

#include "instrumentation.h"
|
|
|
|
namespace gemmlowp {
|
|
|
|
// A tree view of a profile.
|
|
class ProfileTreeView {
|
|
struct Node {
|
|
std::vector<Node*> children;
|
|
const char* label;
|
|
std::size_t weight;
|
|
Node() : label(nullptr), weight(0) {}
|
|
~Node() {
|
|
for (auto child : children) {
|
|
delete child;
|
|
}
|
|
}
|
|
};
|
|
|
|
static bool CompareNodes(Node* n1, Node* n2) {
|
|
return n1->weight > n2->weight;
|
|
}
|
|
|
|
Node root_;
|
|
|
|
void PrintNode(const Node* node, int level) const {
|
|
if (level) {
|
|
for (int i = 1; i < level; i++) {
|
|
printf(" ");
|
|
}
|
|
printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
|
|
}
|
|
for (auto child : node->children) {
|
|
PrintNode(child, level + 1);
|
|
}
|
|
}
|
|
|
|
static void AddStackToNode(const ProfilingStack& stack, Node* node,
|
|
std::size_t level) {
|
|
node->weight++;
|
|
if (stack.size == level) {
|
|
return;
|
|
}
|
|
Node* child_to_add_to = nullptr;
|
|
for (auto child : node->children) {
|
|
if (child->label == stack.labels[level]) {
|
|
child_to_add_to = child;
|
|
break;
|
|
}
|
|
}
|
|
if (!child_to_add_to) {
|
|
child_to_add_to = new Node;
|
|
child_to_add_to->label = stack.labels[level];
|
|
node->children.push_back(child_to_add_to);
|
|
}
|
|
AddStackToNode(stack, child_to_add_to, level + 1);
|
|
return;
|
|
}
|
|
|
|
void AddStack(const ProfilingStack& stack) {
|
|
AddStackToNode(stack, &root_, 0);
|
|
}
|
|
|
|
void AddOtherChildrenToNode(Node* node) {
|
|
std::size_t top_level_children_weight = 0;
|
|
for (auto c : node->children) {
|
|
AddOtherChildrenToNode(c);
|
|
top_level_children_weight += c->weight;
|
|
}
|
|
if (top_level_children_weight) {
|
|
Node* other_child = new Node;
|
|
other_child->label =
|
|
node == &root_ ? "other (outside of any label)" : "other";
|
|
other_child->weight = node->weight - top_level_children_weight;
|
|
node->children.push_back(other_child);
|
|
}
|
|
}
|
|
|
|
void AddOtherNodes() { AddOtherChildrenToNode(&root_); }
|
|
|
|
void SortNode(Node* node) {
|
|
std::sort(node->children.begin(), node->children.end(), CompareNodes);
|
|
for (auto child : node->children) {
|
|
SortNode(child);
|
|
}
|
|
}
|
|
|
|
void Sort() { SortNode(&root_); }
|
|
|
|
public:
|
|
explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
|
|
for (auto stack : stacks) {
|
|
AddStack(stack);
|
|
}
|
|
AddOtherNodes();
|
|
Sort();
|
|
}
|
|
|
|
void Print() const {
|
|
printf("\n");
|
|
printf("gemmlowp profile (%d threads, %d samples)\n",
|
|
static_cast<int>(ThreadsUnderProfiling().size()),
|
|
static_cast<int>(root_.weight));
|
|
PrintNode(&root_, 0);
|
|
printf("\n");
|
|
}
|
|
};
|
|
|
|
// Sleeps for one sampling interval.
// This function is the only place that determines our sampling frequency.
inline void WaitOneProfilerTick() {
  static const int kNanosecondsPerMillisecond = 1000000;

#if defined __arm__ || defined __aarch64__
  // Reduced sampling frequency on mobile devices helps limit time and memory
  // overhead there.
  static const int kTickMilliseconds = 10;
#else
  static const int kTickMilliseconds = 1;
#endif

  timespec tick;
  tick.tv_sec = 0;
  tick.tv_nsec = kTickMilliseconds * kNanosecondsPerMillisecond;
  nanosleep(&tick, nullptr);
}
|
|
|
|
// Process-wide flag recording whether profiling has been started.
// It exists to guard against misuse of the API (starting twice, or
// finishing without having started). Returned by reference so that
// callers can both read and update it; it starts out false.
inline bool& IsProfiling() {
  static bool profiling_in_progress = false;
  return profiling_in_progress;
}
|
|
|
|
// Process-wide flag used to tell the profiler thread to finish.
// Returned by reference so that callers can both read and update it;
// it starts out false.
inline bool& ProfilerThreadShouldFinish() {
  static bool finish_requested = false;
  return finish_requested;
}
|
|
|
|
// Handle of the profiler thread; see ProfilerThreadFunc.
// Returned by reference so that pthread_create() can store into it and
// pthread_join() can read it back.
inline pthread_t& ProfilerThread() {
  static pthread_t profiler_thread_handle;
  return profiler_thread_handle;
}
|
|
|
|
// Records a stack from a running thread.
|
|
// The tricky part is that we're not interrupting the thread.
|
|
// This is OK because we're looking at a pseudo-stack of labels,
|
|
// not at the real thread stack, and if the pseudo-stack changes
|
|
// while we're recording it, we are OK with getting either the
|
|
// old or the new stack. Note that ProfilingStack::Pop
|
|
// only decrements the size, and doesn't null the popped label,
|
|
// so if we're concurrently recording it, it shouldn't change
|
|
// under our feet until another label is pushed, at which point
|
|
// we are OK with getting either this new label or the old one.
|
|
// In the end, the key atomicity property that we are relying on
|
|
// here is that pointers are changed atomically, and the labels
|
|
// are pointers (to literal strings).
|
|
inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) {
  // Hold the lock of the stack being sampled for the duration of the copy.
  ScopedLock sl(thread->stack.lock);
  // The destination is expected to be empty on entry.
  assert(dst->size == 0);
  // Copy labels from the bottom of the pseudo-stack up, growing dst by one
  // entry per label until it is as deep as the thread's stack.
  for (; dst->size < thread->stack.size; dst->size++) {
    dst->labels[dst->size] = thread->stack.labels[dst->size];
  }
}
|
|
|
|
// The profiler thread's entry point.
|
|
// Note that a separate thread is to be started each time we call
|
|
// StartProfiling(), and finishes when we call FinishProfiling().
|
|
// So here we only need to handle the recording and reporting of
|
|
// a single profile.
|
|
inline void* ProfilerThreadFunc(void*) {
|
|
assert(ProfilerThread() == pthread_self());
|
|
|
|
// Since we only handle one profile per profiler thread, the
|
|
// profile data (the array of recorded stacks) can be a local variable here.
|
|
std::vector<ProfilingStack> stacks;
|
|
|
|
while (!ProfilerThreadShouldFinish()) {
|
|
WaitOneProfilerTick();
|
|
{
|
|
ScopedLock sl(GlobalMutexes::Profiler());
|
|
for (auto t : ThreadsUnderProfiling()) {
|
|
ProfilingStack s;
|
|
RecordStack(t, &s);
|
|
stacks.push_back(s);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Profiling is finished and we now report the results.
|
|
ProfileTreeView(stacks).Print();
|
|
|
|
return nullptr;
|
|
}
|
|
|
|
// Starts recording samples.
|
|
inline void StartProfiling() {
|
|
ScopedLock sl(GlobalMutexes::Profiler());
|
|
ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
|
|
IsProfiling() = true;
|
|
ProfilerThreadShouldFinish() = false;
|
|
pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
|
|
}
|
|
|
|
// Stops recording samples, and prints a profile tree-view on stdout.
|
|
inline void FinishProfiling() {
|
|
{
|
|
ScopedLock sl(GlobalMutexes::Profiler());
|
|
ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
|
|
// The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
|
|
// as the scary comments below should make clear.
|
|
// Should we use a condition variable?
|
|
ProfilerThreadShouldFinish() = true;
|
|
} // must release the lock here to avoid deadlock with profiler thread.
|
|
pthread_join(ProfilerThread(), nullptr);
|
|
IsProfiling() = false; // yikes, this should be guarded by the lock!
|
|
}
|
|
|
|
} // namespace gemmlowp
|
|
|
|
#endif // GEMMLOWP_PROFILING_PROFILER_H_
|