//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to convert gpu.launch_func op into a sequence of
// GPU runtime calls. As most GPU runtimes do not have a stable published
// ABI, this pass uses a slim runtime layer that builds on top of the public
// API from GPU runtime headers.
//
//===----------------------------------------------------------------------===//
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "../PassDetail.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"
using namespace mlir;
static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
namespace {
class GpuToLLVMConversionPass
: public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
public:
GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
if (!gpuBinaryAnnotation.empty())
this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
}
// Run the dialect converter on the module.
void runOnOperation() override;
};
class FunctionCallBuilder {
public:
FunctionCallBuilder(StringRef functionName, LLVM::LLVMType returnType,
ArrayRef<LLVM::LLVMType> argumentTypes)
: functionName(functionName),
functionType(LLVM::LLVMType::getFunctionTy(returnType, argumentTypes,
/*isVarArg=*/false)) {}
LLVM::CallOp create(Location loc, OpBuilder &builder,
ArrayRef<Value> arguments) const;
private:
StringRef functionName;
LLVM::LLVMType functionType;
};
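// Example usage (a sketch; "mgpuFoo" is an illustrative name, not one of the
// runtime wrappers used below): a builder declared as
//
//   FunctionCallBuilder fooBuilder = {"mgpuFoo", llvmVoidType,
//                                     {llvmPointerType}};
//
// lazily declares an external function equivalent to the C prototype
//
//   void mgpuFoo(void *ptr);
//
// at the module level on first use, and `create` emits an llvm.call to it.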
template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToLLVMPattern<OpTy>(typeConverter) {}
protected:
MLIRContext *context = &this->getTypeConverter()->getContext();
LLVM::LLVMType llvmVoidType = LLVM::LLVMType::getVoidTy(context);
LLVM::LLVMType llvmPointerType = LLVM::LLVMType::getInt8PtrTy(context);
LLVM::LLVMType llvmPointerPointerType = llvmPointerType.getPointerTo();
LLVM::LLVMType llvmInt8Type = LLVM::LLVMType::getInt8Ty(context);
LLVM::LLVMType llvmInt32Type = LLVM::LLVMType::getInt32Ty(context);
LLVM::LLVMType llvmInt64Type = LLVM::LLVMType::getInt64Ty(context);
LLVM::LLVMType llvmIntPtrType = LLVM::LLVMType::getIntNTy(
context, this->getTypeConverter()->getPointerBitwidth(0));
FunctionCallBuilder moduleLoadCallBuilder = {
"mgpuModuleLoad",
llvmPointerType /* void *module */,
{llvmPointerType /* void *cubin */}};
FunctionCallBuilder moduleUnloadCallBuilder = {
"mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
FunctionCallBuilder moduleGetFunctionCallBuilder = {
"mgpuModuleGetFunction",
llvmPointerType /* void *function */,
{
llvmPointerType, /* void *module */
llvmPointerType /* char *name */
}};
FunctionCallBuilder launchKernelCallBuilder = {
"mgpuLaunchKernel",
llvmVoidType,
{
llvmPointerType, /* void* f */
llvmIntPtrType, /* intptr_t gridXDim */
llvmIntPtrType, /* intptr_t gridYDim */
llvmIntPtrType, /* intptr_t gridZDim */
llvmIntPtrType, /* intptr_t blockXDim */
llvmIntPtrType, /* intptr_t blockYDim */
llvmIntPtrType, /* intptr_t blockZDim */
llvmInt32Type, /* unsigned int sharedMemBytes */
llvmPointerType, /* void *hstream */
llvmPointerPointerType, /* void **kernelParams */
llvmPointerPointerType /* void **extra */
}};
FunctionCallBuilder streamCreateCallBuilder = {
"mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
FunctionCallBuilder streamDestroyCallBuilder = {
"mgpuStreamDestroy", llvmVoidType, {llvmPointerType /* void *stream */}};
FunctionCallBuilder streamSynchronizeCallBuilder = {
"mgpuStreamSynchronize",
llvmVoidType,
{llvmPointerType /* void *stream */}};
FunctionCallBuilder streamWaitEventCallBuilder = {
"mgpuStreamWaitEvent",
llvmVoidType,
{llvmPointerType /* void *stream */, llvmPointerType /* void *event */}};
FunctionCallBuilder eventCreateCallBuilder = {
"mgpuEventCreate", llvmPointerType /* void *event */, {}};
FunctionCallBuilder eventDestroyCallBuilder = {
"mgpuEventDestroy", llvmVoidType, {llvmPointerType /* void *event */}};
FunctionCallBuilder eventSynchronizeCallBuilder = {
"mgpuEventSynchronize",
llvmVoidType,
{llvmPointerType /* void *event */}};
FunctionCallBuilder eventRecordCallBuilder = {
"mgpuEventRecord",
llvmVoidType,
{llvmPointerType /* void *event */, llvmPointerType /* void *stream */}};
FunctionCallBuilder hostRegisterCallBuilder = {
"mgpuMemHostRegisterMemRef",
llvmVoidType,
{llvmIntPtrType /* intptr_t rank */,
llvmPointerType /* void *memrefDesc */,
llvmIntPtrType /* intptr_t elementSizeBytes */}};
FunctionCallBuilder allocCallBuilder = {
"mgpuMemAlloc",
llvmPointerType /* void * */,
{llvmIntPtrType /* intptr_t sizeBytes */,
llvmPointerType /* void *stream */}};
FunctionCallBuilder deallocCallBuilder = {
"mgpuMemFree",
llvmVoidType,
{llvmPointerType /* void *ptr */, llvmPointerType /* void *stream */}};
};
/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:
ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp>(typeConverter) {}
private:
LogicalResult
matchAndRewrite(gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;
};
/// A rewrite pattern to convert gpu.alloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertAllocOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp> {
public:
ConvertAllocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::AllocOp>(typeConverter) {}
private:
LogicalResult
matchAndRewrite(gpu::AllocOp allocOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;
};
/// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertDeallocOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp> {
public:
ConvertDeallocOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::DeallocOp>(typeConverter) {}
private:
LogicalResult
matchAndRewrite(gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;
};
/// A rewrite pattern to convert gpu.wait operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
ConvertWaitOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}
private:
LogicalResult
matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;
};
/// A rewrite pattern to convert gpu.wait async operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertWaitAsyncOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp> {
public:
ConvertWaitAsyncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::WaitOp>(typeConverter) {}
private:
LogicalResult
matchAndRewrite(gpu::WaitOp waitOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;
};
/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
///
/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:
///
/// * moduleLoad -- loads the module given the cubin / hsaco data
/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * streamCreate -- creates a new compute stream to launch the kernel on
/// * launchKernel -- launches the kernel on a stream
/// * streamSynchronize -- waits for operations on the stream to finish
/// * streamDestroy -- destroys the stream
/// * moduleUnload -- unloads the module
///
/// Intermediate data structures are allocated on the stack.
class ConvertLaunchFuncOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
public:
ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
StringRef gpuBinaryAnnotation)
: ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
gpuBinaryAnnotation(gpuBinaryAnnotation) {}
private:
Value generateParamsArray(gpu::LaunchFuncOp launchOp,
ArrayRef<Value> operands, OpBuilder &builder) const;
Value generateKernelNameConstant(StringRef moduleName, StringRef name,
Location loc, OpBuilder &builder) const;
LogicalResult
matchAndRewrite(gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;
llvm::SmallString<32> gpuBinaryAnnotation;
};
class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;
LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
PatternRewriter &rewriter) const override {
// GPU kernel modules are no longer necessary since we have a global
// constant holding the CUBIN or HSACO data.
rewriter.eraseOp(op);
return success();
}
};
} // namespace
void GpuToLLVMConversionPass::runOnOperation() {
LLVMTypeConverter converter(&getContext());
OwningRewritePatternList patterns;
populateStdToLLVMConversionPatterns(converter, patterns);
populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
LLVMConversionTarget target(getContext());
if (failed(
applyPartialConversion(getOperation(), target, std::move(patterns))))
signalPassFailure();
}
LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
ArrayRef<Value> arguments) const {
auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
auto function = [&] {
if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
return function;
return OpBuilder(module.getBody()->getTerminator())
.create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
}();
return builder.create<LLVM::CallOp>(
loc, const_cast<LLVM::LLVMType &>(functionType).getFunctionResultType(),
builder.getSymbolRefAttr(function), arguments);
}
// Returns success if all operands are of LLVM type.
static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
ConversionPatternRewriter &rewriter) {
if (!llvm::all_of(operands, [](Value value) {
return value.getType().isa<LLVM::LLVMType>();
}))
return rewriter.notifyMatchFailure(
op, "Cannot convert if operands aren't of LLVM type.");
return success();
}
static LogicalResult
isAsyncWithOneDependency(ConversionPatternRewriter &rewriter,
gpu::AsyncOpInterface op) {
if (op.getAsyncDependencies().size() != 1)
return rewriter.notifyMatchFailure(
op, "Can only convert with exactly one async dependency.");
if (!op.getAsyncToken())
return rewriter.notifyMatchFailure(op, "Can convert only async version.");
return success();
}
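// Sketch of the host_register lowering (value names are illustrative):
//
//   gpu.host_register %memref : memref<*xf32>
//
// becomes, roughly, a call to the runtime wrapper
//
//   llvm.call @mgpuMemHostRegisterMemRef(%rank, %descPtr, %elementSizeBytes)
//
// where %descPtr points to the promoted (stack-allocated) memref descriptor.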
LogicalResult ConvertHostRegisterOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::HostRegisterOp hostRegisterOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
auto *op = hostRegisterOp.getOperation();
if (failed(areAllLLVMTypes(op, operands, rewriter)))
return failure();
Location loc = op->getLoc();
auto memRefType = hostRegisterOp.value().getType();
auto elementType = memRefType.cast<UnrankedMemRefType>().getElementType();
auto elementSize = getSizeInBytes(loc, elementType, rewriter);
auto arguments = getTypeConverter()->promoteOperands(loc, op->getOperands(),
operands, rewriter);
arguments.push_back(elementSize);
hostRegisterCallBuilder.create(loc, rewriter, arguments);
rewriter.eraseOp(op);
return success();
}
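// Sketch of the gpu.alloc lowering (illustrative; only the async form with a
// single dependency is supported):
//
//   %memref, %token = gpu.alloc async [%dep] (%size) : memref<?xf32>
//
// becomes, roughly,
//
//   %ptr = llvm.call @mgpuMemAlloc(%sizeBytes, %stream)
//
// followed by a bitcast and the construction of a memref descriptor around
// %ptr; the async token is replaced by the stream of the single dependency.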
LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::AllocOp allocOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
MemRefType memRefType = allocOp.getType();
if (failed(areAllLLVMTypes(allocOp, operands, rewriter)) ||
!isSupportedMemRefType(memRefType) ||
failed(isAsyncWithOneDependency(rewriter, allocOp)))
return failure();
auto loc = allocOp.getLoc();
// Get shape of the memref as values: static sizes are constant
// values and dynamic sizes are passed to 'alloc' as operands.
SmallVector<Value, 4> shape;
SmallVector<Value, 4> strides;
Value sizeBytes;
getMemRefDescriptorSizes(loc, memRefType, operands, rewriter, shape, strides,
sizeBytes);
// Allocate the underlying buffer and store a pointer to it in the MemRef
// descriptor.
Type elementPtrType = this->getElementPtrType(memRefType);
auto adaptor = gpu::AllocOpAdaptor(operands, allocOp->getAttrDictionary());
auto stream = adaptor.asyncDependencies().front();
Value allocatedPtr =
allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(0);
allocatedPtr =
rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
// No alignment.
Value alignedPtr = allocatedPtr;
// Create the MemRef descriptor.
auto memRefDescriptor = this->createMemRefDescriptor(
loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);
rewriter.replaceOp(allocOp, {memRefDescriptor, stream});
return success();
}
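// Sketch of the gpu.dealloc lowering (illustrative; only the async form with
// a single dependency is supported):
//
//   %token = gpu.dealloc async [%dep] %memref : memref<?xf32>
//
// becomes, roughly,
//
//   llvm.call @mgpuMemFree(%allocatedPtr, %stream)
//
// with the async token again replaced by the dependency's stream.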
LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::DeallocOp deallocOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
if (failed(areAllLLVMTypes(deallocOp, operands, rewriter)) ||
failed(isAsyncWithOneDependency(rewriter, deallocOp)))
return failure();
Location loc = deallocOp.getLoc();
auto adaptor =
gpu::DeallocOpAdaptor(operands, deallocOp->getAttrDictionary());
Value pointer =
MemRefDescriptor(adaptor.memref()).allocatedPtr(rewriter, loc);
auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
Value stream = adaptor.asyncDependencies().front();
deallocCallBuilder.create(loc, rewriter, {casted, stream});
rewriter.replaceOp(deallocOp, {stream});
return success();
}
// Converts `gpu.wait` to runtime calls. The operands are all CUDA or ROCm
// streams (i.e. void*). The converted op synchronizes the host with every
// stream and then destroys them. That is, it assumes that the streams are not
// used afterwards. In case this isn't correct, we will get a runtime error.
// Eventually, we will have a pass that guarantees this property.
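// For example (a sketch, with %s0 and %s1 the streams backing the tokens):
//
//   gpu.wait [%t0, %t1]
//
// becomes
//
//   llvm.call @mgpuStreamSynchronize(%s0)
//   llvm.call @mgpuStreamSynchronize(%s1)
//   llvm.call @mgpuStreamDestroy(%s0)
//   llvm.call @mgpuStreamDestroy(%s1)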
LogicalResult ConvertWaitOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::WaitOp waitOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
if (waitOp.asyncToken())
return rewriter.notifyMatchFailure(waitOp, "Cannot convert async op.");
Location loc = waitOp.getLoc();
for (auto asyncDependency : operands)
streamSynchronizeCallBuilder.create(loc, rewriter, {asyncDependency});
for (auto asyncDependency : operands)
streamDestroyCallBuilder.create(loc, rewriter, {asyncDependency});
rewriter.eraseOp(waitOp);
return success();
}
// Converts `gpu.wait async` to runtime calls. The result is a new stream that
// is synchronized with all operands, which are CUDA or ROCm streams (i.e.
// void*). We create and record an event after the definition of each stream
// and make the new stream wait on those events before destroying them again.
// This assumes that there is no other use between the definition and this op,
// and the plan is to have a pass that guarantees this property.
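// For example (a sketch, with %s0 the stream backing token %t0):
//
//   %t = gpu.wait async [%t0]
//
// becomes
//
//   %event = llvm.call @mgpuEventCreate()      // inserted after %t0's def
//   llvm.call @mgpuEventRecord(%event, %s0)
//   %s = llvm.call @mgpuStreamCreate()         // at the gpu.wait itself
//   llvm.call @mgpuStreamWaitEvent(%s, %event)
//   llvm.call @mgpuEventDestroy(%event)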
LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::WaitOp waitOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
if (!waitOp.asyncToken())
return rewriter.notifyMatchFailure(waitOp, "Can only convert async op.");
Location loc = waitOp.getLoc();
auto insertionPoint = rewriter.saveInsertionPoint();
SmallVector<Value, 1> events;
for (auto pair : llvm::zip(waitOp.asyncDependencies(), operands)) {
auto token = std::get<0>(pair);
if (auto *defOp = token.getDefiningOp()) {
rewriter.setInsertionPointAfter(defOp);
} else {
// If we can't find the defining op, we record the event at block start,
// which is late and therefore misses parallelism, but still valid.
rewriter.setInsertionPointToStart(waitOp->getBlock());
}
auto event = eventCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
auto stream = std::get<1>(pair);
eventRecordCallBuilder.create(loc, rewriter, {event, stream});
events.push_back(event);
}
rewriter.restoreInsertionPoint(insertionPoint);
auto stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0);
for (auto event : events)
streamWaitEventCallBuilder.create(loc, rewriter, {stream, event});
for (auto event : events)
eventDestroyCallBuilder.create(loc, rewriter, {event});
rewriter.replaceOp(waitOp, {stream});
return success();
}
// Creates a struct containing all kernel parameters on the stack and returns
// an array of type-erased pointers to the fields of the struct. The array can
// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
// The generated code is essentially as follows:
//
// %struct = alloca(sizeof(struct { Parameters... }))
// %array = alloca(NumParameters * sizeof(void *))
// for (i : [0, NumParameters))
// %fieldPtr = llvm.getelementptr %struct[0, i]
// llvm.store parameters[i], %fieldPtr
// %elementPtr = llvm.getelementptr %array[i]
// llvm.store %fieldPtr, %elementPtr
// return %array
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
OpBuilder &builder) const {
auto loc = launchOp.getLoc();
auto numKernelOperands = launchOp.getNumKernelOperands();
auto arguments = getTypeConverter()->promoteOperands(
loc, launchOp.getOperands().take_back(numKernelOperands),
operands.take_back(numKernelOperands), builder);
auto numArguments = arguments.size();
SmallVector<LLVM::LLVMType, 4> argumentTypes;
argumentTypes.reserve(numArguments);
for (auto argument : arguments)
argumentTypes.push_back(argument.getType().cast<LLVM::LLVMType>());
auto structType = LLVM::LLVMType::createStructTy(argumentTypes, StringRef());
auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
builder.getI32IntegerAttr(1));
auto structPtr = builder.create<LLVM::AllocaOp>(
loc, structType.getPointerTo(), one, /*alignment=*/0);
auto arraySize = builder.create<LLVM::ConstantOp>(
loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
arraySize, /*alignment=*/0);
auto zero = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
builder.getI32IntegerAttr(0));
for (auto en : llvm::enumerate(arguments)) {
auto index = builder.create<LLVM::ConstantOp>(
loc, llvmInt32Type, builder.getI32IntegerAttr(en.index()));
auto fieldPtr = builder.create<LLVM::GEPOp>(
loc, argumentTypes[en.index()].getPointerTo(), structPtr,
ArrayRef<Value>{zero, index.getResult()});
builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
auto elementPtr = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType,
arrayPtr, index.getResult());
auto casted =
builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
}
return arrayPtr;
}
// Generates an LLVM IR dialect global that contains the name of the given
// kernel function as a C string, and returns a pointer to its beginning.
// The code is essentially:
//
// llvm.global constant @kernel_name("function_name\00")
// func(...) {
// %0 = llvm.addressof @kernel_name
// %1 = llvm.constant (0 : index)
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }
Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
StringRef moduleName, StringRef name, Location loc,
OpBuilder &builder) const {
// Make sure the trailing zero is included in the constant.
std::vector<char> kernelName(name.begin(), name.end());
kernelName.push_back('\0');
std::string globalName =
std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
return LLVM::createGlobalString(
loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
LLVM::Linkage::Internal);
}
// Emits LLVM IR to launch a kernel function. Expects the kernel module to
// carry the compiled GPU binary: a cubin in the 'nvvm.cubin' attribute, or a
// hsaco in the 'rocdl.hsaco' attribute.
//
// %0 = call %binarygetter
// %1 = call %moduleLoad(%0)
// %2 = <see generateKernelNameConstant>
// %3 = call %moduleGetFunction(%1, %2)
// %4 = call %streamCreate()
// %5 = <see generateParamsArray>
// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
// call %streamSynchronize(%4)
// call %streamDestroy(%4)
// call %moduleUnload(%1)
//
// If the op is async, the stream corresponds to the (single) async dependency
// as well as the async token the op produces.
LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::LaunchFuncOp launchOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const {
if (failed(areAllLLVMTypes(launchOp, operands, rewriter)))
return failure();
if (launchOp.asyncDependencies().size() > 1)
return rewriter.notifyMatchFailure(
launchOp, "Cannot convert with more than one async dependency.");
// Fail when the synchronous version of the op has async dependencies. The
// lowering destroys the stream, and we do not want to check that there is no
// use of the stream after this op.
if (!launchOp.asyncToken() && !launchOp.asyncDependencies().empty())
return rewriter.notifyMatchFailure(
launchOp, "Cannot convert non-async op with async dependencies.");
Location loc = launchOp.getLoc();
// Create an LLVM global with the GPU binary extracted from the kernel module
// annotation and obtain a pointer to the first byte in it.
auto kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
launchOp, launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");
auto binaryAttr =
kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
if (!binaryAttr) {
kernelModule.emitOpError()
<< "missing " << gpuBinaryAnnotation << " attribute";
return failure();
}
SmallString<128> nameBuffer(kernelModule.getName());
nameBuffer.append(kGpuBinaryStorageSuffix);
Value data =
LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
binaryAttr.getValue(), LLVM::Linkage::Internal);
auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
// Get the function from the module. The name corresponds to the name of
// the kernel function.
auto kernelName = generateKernelNameConstant(
launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter);
auto function = moduleGetFunctionCallBuilder.create(
loc, rewriter, {module.getResult(0), kernelName});
auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
rewriter.getI32IntegerAttr(0));
auto adaptor =
gpu::LaunchFuncOpAdaptor(operands, launchOp->getAttrDictionary());
Value stream =
adaptor.asyncDependencies().empty()
? streamCreateCallBuilder.create(loc, rewriter, {}).getResult(0)
: adaptor.asyncDependencies().front();
// Create array of pointers to kernel arguments.
auto kernelParams = generateParamsArray(launchOp, operands, rewriter);
auto nullpointer = rewriter.create<LLVM::NullOp>(loc, llvmPointerPointerType);
launchKernelCallBuilder.create(loc, rewriter,
{function.getResult(0), launchOp.gridSizeX(),
launchOp.gridSizeY(), launchOp.gridSizeZ(),
launchOp.blockSizeX(), launchOp.blockSizeY(),
launchOp.blockSizeZ(),
/*sharedMemBytes=*/zero, stream, kernelParams,
/*extra=*/nullpointer});
if (launchOp.asyncToken()) {
// Async launch: make dependent ops use the same stream.
rewriter.replaceOp(launchOp, {stream});
} else {
// Synchronize with host and destroy stream. This must be the stream created
// above (with no other uses) because we check that the synchronous version
// does not have any async dependencies.
streamSynchronizeCallBuilder.create(loc, rewriter, stream);
streamDestroyCallBuilder.create(loc, rewriter, stream);
rewriter.eraseOp(launchOp);
}
moduleUnloadCallBuilder.create(loc, rewriter, module.getResult(0));
return success();
}
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);
}
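// Example of scheduling this pass from C++ (a sketch; assumes a ModuleOp-level
// pass manager and an empty annotation string to keep the default):
//
//   mlir::PassManager pm(&context);
//   pm.addPass(mlir::createGpuToLLVMConversionPass(
//       /*gpuBinaryAnnotation=*/""));
//   if (failed(pm.run(module)))
//     /* handle the failure */;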
void mlir::populateGpuToLLVMConversionPatterns(
LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
StringRef gpuBinaryAnnotation) {
converter.addConversion(
[context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
return LLVM::LLVMType::getInt8PtrTy(context);
});
patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern>(converter);
patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, gpuBinaryAnnotation);
patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
}