/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
|
|
|
|
#include "ruy/allocator.h"
|
|
|
|
#include "ruy/opt_set.h"
|
|
#include "ruy/size_util.h"
|
|
#include "ruy/system_aligned_alloc.h"
|
|
|
|
namespace ruy {
|
|
|
|
// Releases all memory owned by this allocator: first the fallback blocks
// (via FreeAll), then the main bump-allocation block itself.
// NOTE(review): FreeAll also reallocates ptr_ at the grown size before we
// free it here; that transient allocation is immediately released.
Allocator::~Allocator() {
  FreeAll();
  detail::SystemAlignedFree(ptr_);
}
|
|
|
|
void* Allocator::AllocateFast(std::ptrdiff_t num_bytes) {
|
|
if (current_ + num_bytes > size_) {
|
|
return nullptr;
|
|
}
|
|
void* ret = static_cast<char*>(ptr_) + current_;
|
|
current_ += num_bytes;
|
|
return ret;
|
|
}
|
|
|
|
void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) {
|
|
void* p = detail::SystemAlignedAlloc(num_bytes);
|
|
fallback_blocks_total_size_ += num_bytes;
|
|
fallback_blocks_.push_back(p);
|
|
return p;
|
|
}
|
|
|
|
void* Allocator::AllocateBytes(std::ptrdiff_t num_bytes) {
|
|
if (num_bytes == 0) {
|
|
return nullptr;
|
|
}
|
|
const std::ptrdiff_t rounded_num_bytes =
|
|
round_up_pot(num_bytes, detail::kMinimumBlockAlignment);
|
|
if (void* p = AllocateFast(rounded_num_bytes)) {
|
|
return p;
|
|
}
|
|
return AllocateSlow(rounded_num_bytes);
|
|
}
|
|
|
|
void* Allocator::AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,
|
|
const void* to_avoid) {
|
|
#if RUY_OPT(AVOID_ALIASING)
|
|
if (num_bytes == 0) {
|
|
return nullptr;
|
|
}
|
|
// The minimum L1D cache aliasing periodicity in bytes that we expect to
|
|
// encounter on any device. This does not seem to be documented, but
|
|
// empirically we observe the following:
|
|
// Cortex-A53: 1024
|
|
// Cortex-A55r1: 2048
|
|
// Cortex-A76: not as easily observable.
|
|
// Over-estimating this makes the AVOID_ALIASING optimization useless on
|
|
// devices with lower periodicity.
|
|
// Under-estimating this by 2x should be harmless.
|
|
// Under-estimating this by a larger factor should gradually degrade
|
|
// performance due to cache aliasing causing mutual eviction between
|
|
// the packed matrix data, and the source matrix data being prefetched by the
|
|
// CPU ahead of the packing code execution.
|
|
static constexpr std::uint32_t kMinPeriod = 1024;
|
|
static_assert(is_pot(kMinPeriod), "");
|
|
void* p = AllocateBytes(num_bytes + kMinPeriod);
|
|
auto unsigned_low_bits = [](const void* p) {
|
|
return static_cast<std::uint32_t>(reinterpret_cast<std::uintptr_t>(p));
|
|
};
|
|
// This relies on unsigned integer overflow wrapping around.
|
|
std::uint32_t diff_modulus =
|
|
(unsigned_low_bits(p) - unsigned_low_bits(to_avoid)) % kMinPeriod;
|
|
// diff_modulus is in [0, kMinPeriod).
|
|
// We want it as close as possible to the middle of that interval,
|
|
// kMinPeriod/2. The bad 'aliasing' case, that we are working to avoid,
|
|
// is when diff_modulus is close to the ends of that interval, 0 or
|
|
// kMinPeriod. So we want to add an offset of kMinPeriod/2 if it is in the
|
|
// first or the last quarter of that interval.
|
|
bool need_offset =
|
|
diff_modulus < kMinPeriod / 4 || diff_modulus > 3 * kMinPeriod / 4;
|
|
return static_cast<char*>(p) + (need_offset ? (kMinPeriod / 2) : 0);
|
|
#else
|
|
(void)to_avoid;
|
|
return AllocateBytes(num_bytes);
|
|
#endif
|
|
}
|
|
|
|
// Logically frees every allocation made since the last FreeAll: the main
// block is reset (its memory is retained for reuse) and any fallback
// blocks are released. When fallback blocks exist, the main block is
// reallocated large enough to have satisfied all of them, so steady-state
// callers stop hitting the slow path.
void Allocator::FreeAll() {
  current_ = 0;
  if (fallback_blocks_.empty()) {
    return;
  }

  // No rounding-up of the size means linear instead of logarithmic
  // bound on the number of allocations in some worst-case calling
  // patterns. This is considered worth it because minimizing memory usage
  // is important and actual calling patterns in applications that we care
  // about still reach the no-further-allocations steady state in a small
  // finite number of iterations.
  const std::ptrdiff_t grown_size = size_ + fallback_blocks_total_size_;
  detail::SystemAlignedFree(ptr_);
  ptr_ = detail::SystemAlignedAlloc(grown_size);
  size_ = grown_size;

  for (void* const block : fallback_blocks_) {
    detail::SystemAlignedFree(block);
  }
  fallback_blocks_.clear();
  fallback_blocks_total_size_ = 0;
}
|
|
|
|
} // namespace ruy
|