/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef RUY_RUY_PREPACKED_CACHE_H_
#define RUY_RUY_PREPACKED_CACHE_H_

#include <cstddef>
#include <cstdint>
#include <unordered_map>

#include "ruy/mat.h"

namespace ruy {

// "Low effort" Least Recently Used Cache for Prepacked Matrices
//
// A cache mechanism for prepacked matrices that ejects the oldest entries.
// The implementation is "low effort" in the following ways:
//  - we just linearly search for the oldest entry when doing an ejection
//  - the ejection policy is very simple: if the new size would be above the
//    threshold, we will eject entries until the size is below the threshold.
// Current use cases (RNNs with GEMV operations) indicate that ejection is rare
// and memory constraints are tight, so we devote no additional storage to the
// LRU mechanism and accept O(n) search to eject the oldest entry. In practice,
// the number of total entries has not been shown to be large.
//
// An instance of PrepackedCache is always owned by a Context. Just like
// Context, no "thread safety" consideration is applicable to this class:
// no two threads may simultaneously be accessing the same instance.
class PrepackedCache final {
 public:
  enum class Action { kGotExistingEntry, kInsertedNewEntry };

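  // Default cap on total cached buffer bytes; 1 << 28 bytes is 256 MiB.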
  static constexpr int kDefaultMaxBuffersBytes = 1 << 28;

  // A key in this key-value cache. Equality of keys implies interchangeable
  // packed matrices, so we must be careful to make this Key type specific
  // enough, and its equality comparison operator strict enough.
  //
  // These keys need to be used before the packed matrix buffers are allocated
  // (since they are used to determine whether to allocate a new buffer).
  // So they instead use the source matrix's data pointer. On the other hand,
  // the packed matrix's layout structure is already available by the time we
  // need to handle Keys, and that's fortunate because it is more specific
  // than the source matrix's layout: it also encodes details about the
  // kernel's small-scale block layout. In the past, we made the kernel
  // function pointer part of the cache key, but all that is relevant here is
  // the packed layout.
  //
  // The only other field that needs to be involved is the zero_point, for
  // quantized matrices, although it seems far-fetched that the same matrix
  // data would be reused with different zero_point values.
  //
  // The data types (PEMat::data_type and PEMat::sums_type) are omitted based
  // on the "strict aliasing" model: each memory location should contain data
  // of only one type. This could be relaxed in the future, by adding data
  // types to this Key type, if a use case arises.
  struct Key {
    // The source matrix's data pointer.
    const void* src_data;
    // The packed matrix's layout, see PEMat::layout.
    PMatLayout packed_layout;
    // The packed matrix's zero point (for integer-quantized matrices only).
    std::int32_t zero_point;
  };

  friend bool operator==(const Key& a, const Key& b) {
    return a.src_data == b.src_data && a.packed_layout == b.packed_layout &&
           a.zero_point == b.zero_point;
  }

  struct KeyHash {
    std::size_t operator()(const Key&) const;
  };
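
  // KeyHash::operator() is defined out of line, in the corresponding .cc
  // file. Purely as a hedged sketch, and not necessarily how ruy actually
  // hashes a Key, one plausible implementation would combine the hashes of
  // the key's fields, e.g.:
  //   std::size_t h = std::hash<const void*>()(key.src_data);
  //   h = h * 31 + std::hash<std::int32_t>()(key.zero_point);
  //   // ... further combined with a hash of key.packed_layout's fields.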

  // A dummy timestamp to associate to each entry (see struct Entry) to
  // determine which entry is "least recently used" when ejecting entries.
  // This is just an integer counter, not related to physical time.
  // It does not need to be atomic because only one thread uses an instance
  // of PrepackedCache at a time (see class comment).
  using Timestamp = std::uint64_t;

  struct Entry {
    PEMat packed_matrix;
    Timestamp timestamp;
  };
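
  // As a hedged sketch of how these pieces presumably fit together (the
  // actual logic lives in the .cc file): whenever Get() touches an entry, it
  // would mark that entry as most recently used with something like
  //   entry.timestamp = timestamp_++;
  // so that the entry with the smallest timestamp is the ejection candidate.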

  explicit PrepackedCache(int max_buffers_bytes = kDefaultMaxBuffersBytes)
      : max_buffers_bytes_(max_buffers_bytes) {}
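  // For example, constructing `PrepackedCache cache(16 * 1024 * 1024);` caps
  // the cached buffers at 16 MiB instead of the 256 MiB default.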

  ~PrepackedCache();

  // Returns the total size in bytes of buffers held in this cache.
  int BuffersBytes() const { return buffers_bytes_; }

  // Returns the number of packed matrices held in this cache.
  int MatrixCount() const { return cache_.size(); }

  // This is the method by which new matrices are cached, and existing cache
  // entries are queried.
  // `src_data` is the source matrix data pointer.
  // `packed_matrix` is a packed matrix structure where all fields have
  // already been populated, except for the `data` and `sums` pointers, which
  // have not yet been allocated.
  //
  // This method:
  // 1. Queries the cache for an entry matching the given `src_data` pointer
  //    and the relevant fields of `packed_matrix`, particularly its `layout`.
  // 2. If a matching cache entry does not exist, it is created and inserted
  //    into the cache, and its `data` and `sums` buffers are allocated.
  // 3. The `packed_matrix` has its `data` and `sums` pointers set to point
  //    to the allocated buffers.
  // 4. The cache entry's timestamp is updated so that it is the most recently
  //    used entry.
  // 5. The return value is Action::kInsertedNewEntry if at step 2 a new
  //    entry was created. Otherwise it is Action::kGotExistingEntry.
  Action Get(const void* src_data, PEMat* packed_matrix);
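
  // For illustration only, a hedged sketch of how a caller might use Get();
  // `src` and `PackMatrix` below are placeholders, not actual ruy APIs:
  //   PEMat packed = /* all fields populated except data/sums */;
  //   if (cache->Get(src.data, &packed) ==
  //       PrepackedCache::Action::kInsertedNewEntry) {
  //     // Fresh buffers were just allocated: pack `src` into them now.
  //     PackMatrix(src, &packed);
  //   }
  //   // Either way, `packed.data` and `packed.sums` now point to buffers
  //   // owned by the cache.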

 private:
  void EjectOne();
  void EjectUntilRoomFor(int new_bytes);
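
  // The class comment above describes the "low effort" ejection policy. As a
  // hedged sketch only (the real definitions live in the .cc file),
  // EjectUntilRoomFor could work along these lines, with EjectOne doing the
  // O(n) scan for the entry with the smallest timestamp:
  //   void EjectUntilRoomFor(int new_bytes) {
  //     while (!cache_.empty() &&
  //            buffers_bytes_ + new_bytes > max_buffers_bytes_) {
  //       EjectOne();
  //     }
  //   }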

  std::unordered_map<Key, Entry, KeyHash> cache_;
  const int max_buffers_bytes_;
  int buffers_bytes_ = 0;
  Timestamp timestamp_ = 0;
};

}  // namespace ruy

#endif  // RUY_RUY_PREPACKED_CACHE_H_