You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
400 lines
14 KiB
400 lines
14 KiB
// Copyright 2019, VIXL authors
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are met:
|
|
//
|
|
// * Redistributions of source code must retain the above copyright notice,
|
|
// this list of conditions and the following disclaimer.
|
|
// * Redistributions in binary form must reproduce the above copyright notice,
|
|
// this list of conditions and the following disclaimer in the documentation
|
|
// and/or other materials provided with the distribution.
|
|
// * Neither the name of ARM Limited nor the names of its contributors may be
|
|
// used to endorse or promote products derived from this software without
|
|
// specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
|
|
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
|
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#include <vector>
|
|
|
|
#include "globals-vixl.h"
|
|
#include "aarch64/macro-assembler-aarch64.h"
|
|
|
|
#include "bench-utils.h"
|
|
|
|
using namespace vixl;
|
|
using namespace vixl::aarch64;
|
|
|
|
// Shorthand used by the code generators in this file: `__ Foo(...)` expands to
// `masm_->Foo(...)`.
#define __ masm_->
|
|
|
|
// Register used by the generated code as the base address for its memory
// accesses. GeneratePrologue saves it and points it at the stack space that it
// claims for loads and stores.
const Register BenchCodeGenerator::scratch = x28;
|
|
|
|
// Pick a random general-purpose register of the requested width.
Register BenchCodeGenerator::PickR(unsigned size_in_bits) {
  // Four random bits give a code in [0, 15], so only the caller-saved
  // registers [x0, x15] can be chosen.
  const unsigned code = static_cast<unsigned>(GetRandomBits(4));
  return Register(code, size_in_bits);
}
|
|
|
|
// Pick a random vector register of the requested width.
VRegister BenchCodeGenerator::PickV(unsigned size_in_bits) {
  // Only caller-saved registers, [v0, v7] or [v16, v31], may be returned.
  // Codes in the lower half are folded onto [v0, v7] (so [v8, v15] is never
  // produced), which makes [v0, v7] more likely than the upper registers: the
  // distribution is deliberately not uniform.
  const unsigned raw = static_cast<unsigned>(GetRandomBits(5));
  const unsigned code = (raw >= 16) ? raw : (raw & 0x7);
  return VRegister(code, size_in_bits);
}
|
|
|
|
uint64_t BenchCodeGenerator::GetRandomBits(int bits) {
|
|
VIXL_ASSERT((bits >= 0) && (bits <= 64));
|
|
uint64_t result = 0;
|
|
|
|
while (bits >= 32) {
|
|
// For big chunks, call jrand48 directly.
|
|
result = (result << 32) | jrand48(rand_state_); // [-2^31, 2^31]
|
|
bits -= 32;
|
|
}
|
|
if (bits == 0) return result;
|
|
|
|
// We often only want a few bits at a time, so use stored entropy to avoid
|
|
// frequent calls to jrand48.
|
|
|
|
if (bits > rnd_bits_) {
|
|
// We want more bits than we have.
|
|
result = (result << rnd_bits_) | rnd_;
|
|
bits -= rnd_bits_;
|
|
|
|
rnd_ = static_cast<uint32_t>(jrand48(rand_state_)); // [-2^31, 2^31]
|
|
rnd_bits_ = 32;
|
|
}
|
|
|
|
VIXL_ASSERT(bits <= rnd_bits_);
|
|
result = (result << bits) | (rnd_ % (UINT32_C(1) << bits));
|
|
rnd_ >>= bits;
|
|
rnd_bits_ -= bits;
|
|
return result;
|
|
}
|
|
|
|
unsigned BenchCodeGenerator::PickRSize() {
|
|
return PickBool() ? kWRegSize : kXRegSize;
|
|
}
|
|
|
|
unsigned BenchCodeGenerator::PickFPSize() {
|
|
uint64_t entropy = GetRandomBits(4);
|
|
// Doubles and floats are common in most languages, so use half-precision
|
|
// types only rarely.
|
|
if (entropy == 0) return kHRegSize;
|
|
return ((entropy & 1) == 0) ? kSRegSize : kDRegSize;
|
|
}
|
|
|
|
void BenchCodeGenerator::Generate(size_t min_size_in_bytes) {
|
|
Label start;
|
|
__ Bind(&start);
|
|
|
|
call_depth_++;
|
|
GeneratePrologue();
|
|
|
|
while (masm_->GetSizeOfCodeGeneratedSince(&start) < min_size_in_bytes) {
|
|
GenerateArbitrarySequence();
|
|
}
|
|
|
|
GenerateEpilogue();
|
|
call_depth_--;
|
|
|
|
// Make sure that any labels (created by GenerateBranchSequence) are bound
|
|
// before we exit.
|
|
if (call_depth_ == 0) BindAllPendingLabels();
|
|
}
|
|
|
|
void BenchCodeGenerator::GeneratePrologue() {
|
|
// Construct a normal frame.
|
|
VIXL_ASSERT(masm_->StackPointer().Is(sp));
|
|
__ Push(lr, x29); // x29 is the frame pointer (fp).
|
|
__ Mov(x29, sp);
|
|
VIXL_ASSERT(call_depth_ > 0);
|
|
if (call_depth_ == 1) {
|
|
__ Push(scratch, xzr);
|
|
// Claim space to use for load and stores.
|
|
// - We need at least 4 * kQRegSize bytes for Ld4/St4.
|
|
// - The architecture requires that we allocate a multiple of 16 bytes.
|
|
// - There is no hard upper limit, but the Simulator has a limited stack
|
|
// space.
|
|
__ Claim((4 * kQRegSize) + (16 * GetRandomBits(3)));
|
|
__ Mov(scratch, sp);
|
|
}
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateEpilogue() {
|
|
VIXL_ASSERT(call_depth_ > 0);
|
|
if (call_depth_ == 1) {
|
|
__ Sub(sp, x29, 2 * kXRegSizeInBytes); // Drop the scratch space.
|
|
__ Pop(xzr, scratch);
|
|
}
|
|
__ Pop(x29, lr);
|
|
__ Ret();
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateArbitrarySequence() {
|
|
// Bind pending labels, and remove them from the list.
|
|
// Recently-linked labels are much more likely to be bound than old ones. This
|
|
// should produce a mix of long- (veneered) and short-range branches.
|
|
uint32_t bind_mask = static_cast<uint32_t>(
|
|
GetRandomBits(8) | (GetRandomBits(7) << 1) | (GetRandomBits(6) << 2));
|
|
BindPendingLabels(bind_mask);
|
|
|
|
// If we are at the top call level (call_depth_ == 1), generate nested calls
|
|
// 1/4 of the time, and halve the chance for each call level below that.
|
|
VIXL_ASSERT(call_depth_ > 0);
|
|
if (GetRandomBits(call_depth_ + 1) == 0) {
|
|
GenerateCallReturnSequence();
|
|
return;
|
|
}
|
|
|
|
// These weightings should be roughly representative of real functions.
|
|
switch (GetRandomBits(4)) {
|
|
case 0x0:
|
|
case 0x1:
|
|
GenerateTrivialSequence();
|
|
return;
|
|
case 0x2:
|
|
case 0x3:
|
|
case 0x4:
|
|
case 0x5:
|
|
GenerateOperandSequence();
|
|
return;
|
|
case 0x6:
|
|
case 0x7:
|
|
case 0x8:
|
|
GenerateMemOperandSequence();
|
|
return;
|
|
case 0xb:
|
|
case 0x9:
|
|
case 0xa:
|
|
GenerateImmediateSequence();
|
|
return;
|
|
case 0xc:
|
|
case 0xd:
|
|
GenerateBranchSequence();
|
|
return;
|
|
case 0xe:
|
|
GenerateFPSequence();
|
|
return;
|
|
case 0xf:
|
|
GenerateNEONSequence();
|
|
return;
|
|
}
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateTrivialSequence() {
|
|
unsigned size = PickRSize();
|
|
__ Asr(PickR(size), PickR(size), 4);
|
|
__ Bfi(PickR(size), PickR(size), 5, 14);
|
|
__ Bfc(PickR(size), 5, 14);
|
|
__ Cinc(PickR(size), PickR(size), ge);
|
|
__ Cinv(PickR(size), PickR(size), ne);
|
|
__ Cls(PickR(size), PickR(size));
|
|
__ Cneg(PickR(size), PickR(size), lt);
|
|
__ Mrs(PickX(), NZCV);
|
|
__ Nop();
|
|
__ Mul(PickR(size), PickR(size), PickR(size));
|
|
__ Rbit(PickR(size), PickR(size));
|
|
__ Rev(PickR(size), PickR(size));
|
|
__ Sdiv(PickR(size), PickR(size), PickR(size));
|
|
if (!labels_.empty()) {
|
|
__ Adr(PickX(), labels_.begin()->target);
|
|
}
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateOperandSequence() {
|
|
unsigned size = PickRSize();
|
|
// The cast to Operand is normally implicit for simple registers, but we
|
|
// explicitly specify it in every case here to ensure that the benchmark does
|
|
// what we expect.
|
|
__ And(PickR(size), PickR(size), Operand(PickR(size)));
|
|
__ Bics(PickR(size), PickR(size), Operand(PickR(size)));
|
|
__ Orr(PickR(size), PickR(size), Operand(PickR(size)));
|
|
__ Eor(PickR(size), PickR(size), Operand(PickR(size)));
|
|
__ Tst(PickR(size), Operand(PickR(size)));
|
|
__ Eon(PickR(size), PickR(size), Operand(PickR(size)));
|
|
__ Cmp(PickR(size), Operand(PickR(size)));
|
|
__ Negs(PickR(size), Operand(PickR(size)));
|
|
__ Mvn(PickR(size), Operand(PickR(size)));
|
|
__ Ccmp(PickR(size), Operand(PickR(size)), NoFlag, eq);
|
|
__ Ccmn(PickR(size), Operand(PickR(size)), NoFlag, eq);
|
|
__ Csel(PickR(size), Operand(PickR(size)), Operand(PickR(size)), lt);
|
|
{
|
|
// Ensure that `claim` doesn't alias any PickR().
|
|
UseScratchRegisterScope temps(masm_);
|
|
Register claim = temps.AcquireX();
|
|
// We should only claim a 16-byte-aligned amount, since we're using the
|
|
// system stack pointer.
|
|
__ Mov(claim, GetRandomBits(4) * 16);
|
|
__ Claim(Operand(claim));
|
|
// Also claim a bit more, so we can store at sp+claim.
|
|
__ Claim(Operand(32));
|
|
__ Poke(PickR(size), Operand(claim));
|
|
__ Peek(PickR(size), Operand(8));
|
|
__ Poke(PickR(size), Operand(16));
|
|
__ Peek(PickR(size), Operand(claim.W(), UXTW));
|
|
__ Drop(Operand(32));
|
|
__ Drop(Operand(claim));
|
|
}
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateMemOperandSequence() {
|
|
unsigned size = PickRSize();
|
|
RegList store_list = GetRandomBits(16); // Restrict to [x0, x15].
|
|
__ StoreCPURegList(CPURegList(CPURegister::kRegister, size, store_list),
|
|
MemOperand(scratch));
|
|
RegList load_list = GetRandomBits(16); // Restrict to [x0, x15].
|
|
__ LoadCPURegList(CPURegList(CPURegister::kRegister, size, load_list),
|
|
MemOperand(scratch));
|
|
__ Str(PickX(), MemOperand(scratch));
|
|
__ Strb(PickW(), MemOperand(scratch, 42));
|
|
__ Strh(PickW(), MemOperand(scratch, 42, PostIndex));
|
|
__ Ldrsw(PickX(), MemOperand(scratch, -42, PreIndex));
|
|
__ Ldr(PickR(size), MemOperand(scratch, 19)); // Translated to ldur.
|
|
__ Push(PickX(), PickX());
|
|
// Ensure unique registers (in [x0, x15]) for Pop.
|
|
__ Pop(Register(static_cast<int>(GetRandomBits(2)) + 0, kWRegSize),
|
|
Register(static_cast<int>(GetRandomBits(2)) + 4, kWRegSize),
|
|
Register(static_cast<int>(GetRandomBits(2)) + 8, kWRegSize),
|
|
Register(static_cast<int>(GetRandomBits(2)) + 12, kWRegSize));
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateImmediateSequence() {
|
|
unsigned size = PickRSize();
|
|
__ And(PickR(size), PickR(size), GetRandomBits(size));
|
|
__ Sub(PickR(size), PickR(size), GetRandomBits(size));
|
|
__ Mov(PickR(size), GetRandomBits(size));
|
|
__ Movk(PickX(), GetRandomBits(16), static_cast<int>(GetRandomBits(2)) * 16);
|
|
}
|
|
|
|
void BenchCodeGenerator::BindPendingLabels(uint64_t bind_mask) {
|
|
if (bind_mask == 0) return;
|
|
// The labels we bind here jump back to just after each branch that refers
|
|
// to them. This allows a simple, linear execution path, whilst still
|
|
// benchmarking long-range labels.
|
|
//
|
|
// Ensure that code falling through into this sequence does not jump
|
|
// back to an earlier point in the execution path.
|
|
Label done;
|
|
__ B(&done);
|
|
|
|
std::list<LabelPair>::iterator it = labels_.begin();
|
|
while ((it != labels_.end()) && (bind_mask != 0)) {
|
|
if ((bind_mask & 1) != 0) {
|
|
// Bind the label and jump back to its source.
|
|
__ Bind(it->target);
|
|
__ B(it->cont);
|
|
delete it->target;
|
|
delete it->cont;
|
|
it = labels_.erase(it);
|
|
} else {
|
|
++it; // Don't bind this one.
|
|
}
|
|
bind_mask >>= 1;
|
|
}
|
|
__ Bind(&done);
|
|
}
|
|
|
|
void BenchCodeGenerator::BindAllPendingLabels() {
|
|
while (!labels_.empty()) {
|
|
// BindPendingLables generates a branch over each block of bound labels.
|
|
// This will be repeated for each call here, but the effect is minimal and
|
|
// (empirically) we rarely accumulate more than 64 pending labels anyway.
|
|
BindPendingLabels(UINT64_MAX);
|
|
}
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateBranchSequence() {
|
|
{
|
|
LabelPair pair = {new Label(), new Label()};
|
|
__ B(lt, pair.target);
|
|
__ Bind(pair.cont);
|
|
labels_.push_front(pair);
|
|
}
|
|
|
|
{
|
|
LabelPair pair = {new Label(), new Label()};
|
|
__ Tbz(PickX(),
|
|
static_cast<int>(GetRandomBits(kXRegSizeLog2)),
|
|
pair.target);
|
|
__ Bind(pair.cont);
|
|
labels_.push_front(pair);
|
|
}
|
|
|
|
{
|
|
LabelPair pair = {new Label(), new Label()};
|
|
__ Cbz(PickX(), pair.target);
|
|
__ Bind(pair.cont);
|
|
labels_.push_front(pair);
|
|
}
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateCallReturnSequence() {
|
|
Label fn, done;
|
|
|
|
if (PickBool()) {
|
|
__ Bl(&fn);
|
|
} else {
|
|
Register reg = PickX();
|
|
__ Adr(reg, &fn);
|
|
__ Blr(reg);
|
|
}
|
|
__ B(&done);
|
|
|
|
__ Bind(&fn);
|
|
// Recurse with a randomised (but fairly small) minimum size.
|
|
Generate(GetRandomBits(8));
|
|
|
|
__ Bind(&done);
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateFPSequence() {
|
|
unsigned size = PickFPSize();
|
|
unsigned other_size = PickBool() ? size * 2 : size / 2;
|
|
if (other_size < kHRegSize) other_size = kDRegSize;
|
|
if (other_size > kDRegSize) other_size = kHRegSize;
|
|
|
|
__ Fadd(PickV(size), PickV(size), PickV(size));
|
|
__ Fmul(PickV(size), PickV(size), PickV(size));
|
|
__ Fcvt(PickV(other_size), PickV(size));
|
|
__ Fjcvtzs(PickW(), PickD());
|
|
__ Fccmp(PickV(size), PickV(size), NCVFlag, pl);
|
|
__ Fdiv(PickV(size), PickV(size), PickV(size));
|
|
__ Fmov(PickV(size), 1.25 * GetRandomBits(2));
|
|
__ Fmsub(PickV(size), PickV(size), PickV(size), PickV(size));
|
|
__ Frintn(PickV(size), PickV(size));
|
|
}
|
|
|
|
void BenchCodeGenerator::GenerateNEONSequence() {
|
|
__ And(PickV().V16B(), PickV().V16B(), PickV().V16B());
|
|
__ Sqrshl(PickV().V8H(), PickV().V8H(), PickV().V8H());
|
|
__ Umull(PickV().V2D(), PickV().V2S(), PickV().V2S());
|
|
__ Sqdmlal2(PickV().V4S(), PickV().V8H(), PickV().V8H());
|
|
|
|
// For structured loads and stores, we have to specify sequential (wrapped)
|
|
// registers, so start with [v16, v31] and allow them to wrap in to the
|
|
// [v0, v7] range.
|
|
VRegister vt(16 + static_cast<unsigned>(GetRandomBits(4)), kQRegSize);
|
|
VRegister vt2((vt.GetCode() + 1) % kNumberOfVRegisters, kQRegSize);
|
|
VRegister vt3((vt.GetCode() + 2) % kNumberOfVRegisters, kQRegSize);
|
|
VRegister vt4((vt.GetCode() + 3) % kNumberOfVRegisters, kQRegSize);
|
|
VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt));
|
|
VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt2));
|
|
VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt3));
|
|
VIXL_ASSERT(!kCalleeSavedV.IncludesAliasOf(vt4));
|
|
__ Ld3(vt.V4S(), vt2.V4S(), vt3.V4S(), MemOperand(scratch));
|
|
__ St4(vt.V16B(), vt2.V16B(), vt3.V16B(), vt4.V16B(), MemOperand(scratch));
|
|
|
|
__ Fmaxv(PickV().H(), PickV().V8H());
|
|
__ Fminp(PickV().V4S(), PickV().V4S(), PickV().V4S());
|
|
}
|