You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
354 lines
11 KiB
354 lines
11 KiB
//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
|
|
/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
|
|
/// with sequential versions where possible.
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUSubtarget.h"
|
|
#include "SIInstrInfo.h"
|
|
#include "SIMachineFunctionInfo.h"
|
|
#include "llvm/ADT/Statistic.h"
|
|
#include "llvm/CodeGen/LiveInterval.h"
|
|
#include "llvm/CodeGen/LiveIntervals.h"
|
|
#include "llvm/CodeGen/LiveRegMatrix.h"
|
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
|
#include "llvm/CodeGen/VirtRegMap.h"
|
|
#include "llvm/InitializePasses.h"
|
|
#include "llvm/Support/MathExtras.h"
|
|
#include <algorithm>
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "amdgpu-nsa-reassign"
|
|
|
|
STATISTIC(NumNSAInstructions,
|
|
"Number of NSA instructions with non-sequential address found");
|
|
STATISTIC(NumNSAConverted,
|
|
"Number of NSA instructions changed to sequential");
|
|
|
|
namespace {
|
|
|
|
class GCNNSAReassign : public MachineFunctionPass {
|
|
public:
|
|
static char ID;
|
|
|
|
GCNNSAReassign() : MachineFunctionPass(ID) {
|
|
initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
|
|
}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
|
|
StringRef getPassName() const override { return "GCN NSA Reassign"; }
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.addRequired<LiveIntervals>();
|
|
AU.addRequired<VirtRegMap>();
|
|
AU.addRequired<LiveRegMatrix>();
|
|
AU.setPreservesAll();
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
private:
|
|
typedef enum {
|
|
NOT_NSA, // Not an NSA instruction
|
|
FIXED, // NSA which we cannot modify
|
|
NON_CONTIGUOUS, // NSA with non-sequential address which we can try
|
|
// to optimize.
|
|
CONTIGUOUS // NSA with all sequential address registers
|
|
} NSA_Status;
|
|
|
|
const GCNSubtarget *ST;
|
|
|
|
const MachineRegisterInfo *MRI;
|
|
|
|
const SIRegisterInfo *TRI;
|
|
|
|
VirtRegMap *VRM;
|
|
|
|
LiveRegMatrix *LRM;
|
|
|
|
LiveIntervals *LIS;
|
|
|
|
unsigned MaxNumVGPRs;
|
|
|
|
const MCPhysReg *CSRegs;
|
|
|
|
NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
|
|
|
|
bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
|
|
unsigned StartReg) const;
|
|
|
|
bool canAssign(unsigned StartReg, unsigned NumRegs) const;
|
|
|
|
bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
|
|
};
|
|
|
|
} // End anonymous namespace.
|
|
|
|
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
|
|
false, false)
|
|
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
|
|
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
|
|
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
|
|
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
|
|
false, false)
|
|
|
|
|
|
char GCNNSAReassign::ID = 0;
|
|
|
|
char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
|
|
|
|
bool
|
|
GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
|
|
unsigned StartReg) const {
|
|
unsigned NumRegs = Intervals.size();
|
|
|
|
for (unsigned N = 0; N < NumRegs; ++N)
|
|
if (VRM->hasPhys(Intervals[N]->reg()))
|
|
LRM->unassign(*Intervals[N]);
|
|
|
|
for (unsigned N = 0; N < NumRegs; ++N)
|
|
if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
|
|
return false;
|
|
|
|
for (unsigned N = 0; N < NumRegs; ++N)
|
|
LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
|
|
|
|
return true;
|
|
}
|
|
|
|
bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
|
|
for (unsigned N = 0; N < NumRegs; ++N) {
|
|
unsigned Reg = StartReg + N;
|
|
if (!MRI->isAllocatable(Reg))
|
|
return false;
|
|
|
|
for (unsigned I = 0; CSRegs[I]; ++I)
|
|
if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
|
|
!LRM->isPhysRegUsed(CSRegs[I]))
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
|
|
unsigned NumRegs = Intervals.size();
|
|
|
|
if (NumRegs > MaxNumVGPRs)
|
|
return false;
|
|
unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
|
|
|
|
for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
|
|
if (!canAssign(Reg, NumRegs))
|
|
continue;
|
|
|
|
if (tryAssignRegisters(Intervals, Reg))
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/// Classify \p MI with respect to NSA address register reassignment.
///
/// \returns
///  - NOT_NSA when the opcode has no MIMG info or is not GFX10 NSA encoded;
///  - FIXED when some address operand is a physical register or is not an
///    assigned register in the VirtRegMap, or (only when \p Fast is false)
///    fails one of the eligibility checks below;
///  - CONTIGUOUS when address operand I is mapped to the physical register
///    of operand 0 plus I for every I;
///  - NON_CONTIGUOUS otherwise.
///
/// \param Fast skip the per-operand eligibility checks and only look at the
///        current register assignment.
GCNNSAReassign::NSA_Status
GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return NSA_Status::NOT_NSA;

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);

  unsigned VgprBase = 0;
  bool NSA = false;
  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
    Register Reg = Op.getReg();
    if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
      return NSA_Status::FIXED;

    Register PhysReg = VRM->getPhys(Reg);

    if (!Fast) {
      if (!PhysReg)
        return NSA_Status::FIXED;

      // Bail if the address is not a VGPR32. It should be possible to extend
      // the optimization to work with subregs of wider register tuples, but
      // the logic to find free registers would be much more complicated with
      // a much lower chance of success. It seems reasonable to assume that in
      // most cases a tuple is used because a vector variable contains
      // different parts of an address, and it is either already consecutive
      // or cannot be reassigned if not. If needed, it is better to rely on
      // the register coalescer to process such address tuples.
      if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
        return NSA_Status::FIXED;

      const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);

      // The unique def is a copy whose source is the assigned register.
      if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
        return NSA_Status::FIXED;

      // Bind by reference: iterating by value copied a whole MachineOperand
      // for every use.
      for (const MachineOperand &U : MRI->use_nodbg_operands(Reg)) {
        if (U.isImplicit())
          return NSA_Status::FIXED;
        const MachineInstr *UseInst = U.getParent();
        // A use that is a copy whose destination is the assigned register.
        if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
          return NSA_Status::FIXED;
      }

      if (!LIS->hasInterval(Reg))
        return NSA_Status::FIXED;
    }

    // Operand 0 defines the base; every later operand must sit exactly I
    // registers above it for the address to count as sequential.
    if (I == 0)
      VgprBase = PhysReg;
    else if (VgprBase + I != PhysReg)
      NSA = true;
  }

  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
}
|
|
|
|
/// Pass entry point.
///
/// Does nothing on subtargets older than GFX10. Otherwise every instruction
/// that CheckNSA() classifies as CONTIGUOUS or NON_CONTIGUOUS is recorded as
/// a candidate, and for each non-contiguous candidate the pass tries to
/// reassign its address registers to a consecutive run. A reassignment is
/// rolled back when a candidate that was already contiguous stops being so.
///
/// \returns true if at least one instruction got a new allocation.
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  // The VGPR budget is the smaller of the function limit and the limit for
  // the function's occupancy.
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  // A candidate pairs an NSA instruction with a flag telling whether its
  // address registers are currently sequential.
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      const NSA_Status Status = CheckNSA(MI);
      if (Status == NSA_Status::CONTIGUOUS) {
        Candidates.push_back(std::make_pair(&MI, true));
      } else if (Status == NSA_Status::NON_CONTIGUOUS) {
        Candidates.push_back(std::make_pair(&MI, false));
        ++NumNSAInstructions;
      }
    }
  }

  bool Changed = false;
  for (auto &Cand : Candidates) {
    if (Cand.second)
      continue;

    const MachineInstr *MI = Cand.first;
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happen to be fixed.
      Cand.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    // Gather the live intervals of the address operands, remember their
    // current physical registers, and track the slot index range they span.
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));

      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }

      if (I == 0) {
        MinInd = LI->beginIndex();
        MaxInd = LI->endIndex();
      } else {
        MinInd = std::min(MinInd, LI->beginIndex());
        MaxInd = std::max(MaxInd, LI->endIndex());
      }
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI : Intervals)
                 dbgs() << " "
                        << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (Success) {
      // Check we did not make it worse for other instructions: scan the
      // candidates from the first one not before MinInd up to MaxInd.
      auto It = std::lower_bound(Candidates.begin(), &Cand, MinInd,
                                 [this](const Candidate &Elt, SlotIndex Idx) {
                                   return LIS->getInstructionIndex(*Elt.first) <
                                          Idx;
                                 });
      for (auto End = Candidates.end(); It != End; ++It) {
        if (!(LIS->getInstructionIndex(*It->first) < MaxInd))
          break;
        if (It->second &&
            CheckNSA(*It->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *It->first);
          break;
        }
      }
    } else {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    }

    if (!Success) {
      // Roll back: release anything still assigned, then restore the
      // registers recorded before the attempt.
      for (LiveInterval *LI : Intervals) {
        if (VRM->hasPhys(LI->reg()))
          LRM->unassign(*LI);
      }

      for (unsigned I = 0; I < Info->VAddrDwords; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    Cand.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
               << " : "
               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}
|