/* Copyright 2019 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
|
|
|
|
#include "ruy/allocator.h"
|
|
|
|
#include "ruy/opt_set.h"
|
|
#include "ruy/size_util.h"
|
|
#include "ruy/system_aligned_alloc.h"
|
|
|
|
namespace ruy {
|
|
|
|
// Releases all memory owned by this allocator: first the fallback blocks
// (via FreeAll), then the main bump-allocation block itself.
// NOTE(review): FreeAll also reallocates ptr_ at the grown size before we
// free it here; that transient allocation is immediately released.
Allocator::~Allocator() {
  FreeAll();
  detail::SystemAlignedFree(ptr_);
}
|
|
|
|
void* Allocator::AllocateFast(std::ptrdiff_t num_bytes) {
|
|
if (current_ + num_bytes > size_) {
|
|
return nullptr;
|
|
}
|
|
void* ret = static_cast<char*>(ptr_) + current_;
|
|
current_ += num_bytes;
|
|
return ret;
|
|
}
|
|
|
|
void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) {
|
|
void* p = detail::SystemAlignedAlloc(num_bytes);
|
|
fallback_blocks_total_size_ += num_bytes;
|
|
fallback_blocks_.push_back(p);
|
|
return p;
|
|
}
|
|
|
|
void* Allocator::AllocateBytes(std::ptrdiff_t num_bytes) {
|
|
if (num_bytes == 0) {
|
|
return nullptr;
|
|
}
|
|
const std::ptrdiff_t rounded_num_bytes =
|
|
round_up_pot(num_bytes, detail::kMinimumBlockAlignment);
|
|
if (void* p = AllocateFast(rounded_num_bytes)) {
|
|
return p;
|
|
}
|
|
return AllocateSlow(rounded_num_bytes);
|
|
}
|
|
|
|
void* Allocator::AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,
|
|
const void* to_avoid) {
|
|
#if RUY_OPT(AVOID_ALIASING)
|
|
if (num_bytes == 0) {
|
|
return nullptr;
|
|
}
|
|
// The minimum L1D cache aliasing periodicity in bytes that we expect to
|
|
// encounter on any device. This does not seem to be documented, but
|
|
// empirically we observe the following:
|
|
// Cortex-A53: 1024
|
|
// Cortex-A55r1: 2048
|
|
// Cortex-A76: not as easily observable.
|
|
// Over-estimating this makes the AVOID_ALIASING optimization useless on
|
|
// devices with lower periodicity.
|
|
// Under-estimating this by 2x should be harmless.
|
|
// Under-estimating this by a larger factor should gradually degrade
|
|
// performance due to cache aliasing causing mutual eviction between
|
|
// the packed matrix data, and the source matrix data being prefetched by the
|
|
// CPU ahead of the packing code execution.
|
|
static constexpr std::uint32_t kMinPeriod = 1024;
|
|
static_assert(is_pot(kMinPeriod), "");
|
|
void* p = AllocateBytes(num_bytes + kMinPeriod);
|
|
auto unsigned_low_bits = [](const void* p) {
|
|
return static_cast<std::uint32_t>(reinterpret_cast<std::uintptr_t>(p));
|
|
};
|
|
// This relies on unsigned integer overflow wrapping around.
|
|
std::uint32_t diff_modulus =
|
|
(unsigned_low_bits(p) - unsigned_low_bits(to_avoid)) % kMinPeriod;
|
|
// diff_modulus is in [0, kMinPeriod).
|
|
// We want it as close as possible to the middle of that interval,
|
|
// kMinPeriod/2. The bad 'aliasing' case, that we are working to avoid,
|
|
// is when diff_modulus is close to the ends of that interval, 0 or
|
|
// kMinPeriod. So we want to add an offset of kMinPeriod/2 if it is in the
|
|
// first or the last quarter of that interval.
|
|
bool need_offset =
|
|
diff_modulus < kMinPeriod / 4 || diff_modulus > 3 * kMinPeriod / 4;
|
|
return static_cast<char*>(p) + (need_offset ? (kMinPeriod / 2) : 0);
|
|
#else
|
|
(void)to_avoid;
|
|
return AllocateBytes(num_bytes);
|
|
#endif
|
|
}
|
|
|
|
// Logically frees every allocation made since the last FreeAll: the main
// block is reset (its memory is retained for reuse) and any fallback
// blocks are released. When fallback blocks exist, the main block is
// reallocated large enough to have satisfied all of them, so steady-state
// callers stop hitting the slow path.
void Allocator::FreeAll() {
  current_ = 0;
  if (fallback_blocks_.empty()) {
    return;
  }

  // No rounding-up of the size means linear instead of logarithmic
  // bound on the number of allocations in some worst-case calling
  // patterns. This is considered worth it because minimizing memory usage
  // is important and actual calling patterns in applications that we care
  // about still reach the no-further-allocations steady state in a small
  // finite number of iterations.
  const std::ptrdiff_t grown_size = size_ + fallback_blocks_total_size_;
  detail::SystemAlignedFree(ptr_);
  ptr_ = detail::SystemAlignedAlloc(grown_size);
  size_ = grown_size;

  for (void* const block : fallback_blocks_) {
    detail::SystemAlignedFree(block);
  }
  fallback_blocks_.clear();
  fallback_blocks_total_size_ = 0;
}
|
|
|
|
} // namespace ruy
|