/*
* Copyright (C) 2021 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <android-base/logging.h>
#include <android-base/unique_fd.h>
#include <android/hardware_buffer.h>
#include <gtest/gtest.h>
#include <vulkan/vulkan.h>
#include <vulkan/vulkan_android.h>
#include <algorithm>
#include <cmath>
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "TestNeuralNetworksWrapper.h"
#ifndef NNTEST_ONLY_PUBLIC_API
#include "Manager.h"
#endif
namespace android::nn {
namespace {
using Type = test_wrapper::Type;
using OperandType = test_wrapper::OperandType;
using Result = test_wrapper::Result;
constexpr uint32_t kOperandSizeX = 256;
constexpr uint32_t kOperandSizeY = 256;
constexpr uint32_t kOperandLength = kOperandSizeX * kOperandSizeY;
constexpr uint32_t kNumberOfIterationsToTest = 100;
constexpr uint32_t kMaxNumberOfPrintedErrors = 10;
// This file implements a test suite that exercises a GPU -> NNAPI pipeline using AHardwareBuffer
// and sync fence. One pass of the pipeline involves the following three stages:
//
// - GPU: Invoke the compute shader to clear all elements in the output buffer to value "1"
// of the corresponding element type. Because the GPU may not natively support
// float16/int8/uint8 data types, we pack each data type into a 4-byte chunk as uint32_t
// and pass it to the shader. E.g., float16 will be packed as 0x3c003c00 -- the float16 value
// of "1" (0x3c00) repeated twice. The compute shader will use this 4-byte chunk to clear
// the data in the output buffer (see CLEAR_DATA in the compute shader code).
//
// The GPU workload will output directly to an AHardwareBuffer and export an Android sync
// fence.
//
// - NNAPI: Execute a broadcast ADD operation
//
// output = ADD(input, const, act)
//
// where "input" and "output" are of size [kOperandSizeY, kOperandSizeX], "const" and
// "act" are model constant operands, "const" is of size [1] and value "1" of the
// corresponding element type, "act" = 0. The ADD operation will increment each element
// in the input tensor by 1.
//
// The NNAPI executor takes the GPU output AHardwareBuffer as its input memory,
// and directly outputs to another AHardwareBuffer. We use startComputeWithDependencies
// to wait on the sync fence from the GPU workload. If supported, the NNAPI executor will
// emit a sync fence; otherwise, it will wait until the workload is finished.
//
// - Check: Verify that each element in the resulting tensor is 1 + 1 = 2.
//
// We use the introspection API to run the pipeline with each individual driver. Because this test is
// added in NNAPI feature level 5, we will exclude devices with a lower feature level. We expect
// that if the driver successfully prepares the model, it should finish execution without an error.
//
// The pipeline is tested with four data types: float32, float16, quant8_asymm, and
// quant8_asymm_signed. These data types are chosen so that a driver is likely to support at
// least one of them.
//
// For each configuration, we run the pipeline for kNumberOfIterationsToTest iterations.
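// The SPIR-V binary of the compute shader, included at build time as a list of 32-bit words.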
const std::vector<uint32_t> kComputeShader =
#include "shaders/TestGpuNnapi.comp.spv.inl"
;
// The expected element value in the final NNAPI output AHardwareBuffer.
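// Each element starts at "1" (cleared by the GPU) and the NNAPI ADD adds the constant "1", so the
// result is 1 + 1 = 2. This is exactly representable in float16/float32, and with the quantized
// operands using scale 1.0 and zero point 0 (see NnapiExecutor::initialize) the stored
// int8/uint8 value is exactly 2 as well.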
constexpr uint32_t kExpectedResultInInt = 2;
// Helper template for information related to a primary tensor data type. Only four specializations
// exist for this template: Type::TENSOR_FLOAT32, Type::TENSOR_FLOAT16, Type::TENSOR_QUANT8_ASYMM,
// and Type::TENSOR_QUANT8_ASYMM_SIGNED. Each specialization corresponds to a primary data type for
// the testing pipeline.
//
// Each template specialization defines the following fields:
// - ElementType: The corresponding C++ type. Use sizeof(ElementType) to get the element size.
// - kIsQuantized: Whether the data type is a quantized type or not.
// - kClearData: The CLEAR_DATA used in the compute shader.
template <Type dataType>
struct TestTypeHelper;
template <>
struct TestTypeHelper<Type::TENSOR_FLOAT32> {
using ElementType = float;
static constexpr bool kIsQuantized = false;
// One float32 of value (1.0) packed into uint32_t
static constexpr uint32_t kClearData = 0x3f800000;
};
template <>
struct TestTypeHelper<Type::TENSOR_FLOAT16> {
using ElementType = _Float16;
static constexpr bool kIsQuantized = false;
// Two float16 of value (1.0) packed into uint32_t
static constexpr uint32_t kClearData = 0x3c003c00;
};
template <>
struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM> {
using ElementType = uint8_t;
static constexpr bool kIsQuantized = true;
// Four uint8_t of value (1) packed into uint32_t
static constexpr uint32_t kClearData = 0x01010101;
};
template <>
struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM_SIGNED> {
using ElementType = int8_t;
static constexpr bool kIsQuantized = true;
// Four int8_t of value (1) packed into uint32_t
static constexpr uint32_t kClearData = 0x01010101;
};
bool isExtensionSupported(const std::vector<VkExtensionProperties>& supportedExtensions,
const char* requestedExtension) {
return std::any_of(supportedExtensions.begin(), supportedExtensions.end(),
[requestedExtension](const auto& extension) {
return strcmp(extension.extensionName, requestedExtension) == 0;
});
}
// Records the workgroup size and the group counts used when dispatching the compute shader.
struct DispatchSize {
uint32_t workgroupSize;
uint32_t groupCountX;
uint32_t groupCountY;
};
// Choose an appropriate dispatch size. We are using a square workgroup size.
template <Type dataType>
DispatchSize chooseDispatchSize(const VkPhysicalDeviceLimits& limits) {
// Compute the number of invocations along each dimension.
const uint32_t elementSize = sizeof(typename TestTypeHelper<dataType>::ElementType);
const uint32_t numberOfElementsPerInvocation = sizeof(uint32_t) / elementSize;
const uint32_t workgroupInvocationsX = kOperandSizeX / numberOfElementsPerInvocation;
const uint32_t workgroupInvocationsY = kOperandSizeY;
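// Worked example with kOperandSizeX = kOperandSizeY = 256: for TENSOR_FLOAT16, elementSize = 2,
// so each invocation clears two elements and workgroupInvocationsX = 256 / 2 = 128; for
// TENSOR_FLOAT32, each invocation clears a single element and both dimensions stay at 256.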
// Make sure the workgroup size does not exceed the number of invocations along the X and Y
// dimensions.
uint32_t workgroupSize = std::min(workgroupInvocationsX, workgroupInvocationsY);
// Make sure the workgroup size does not exceed the device limit along the X and Y dimensions.
workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[0]);
workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[1]);
// Make sure the total number of invocations does not exceed the device limit.
uint32_t maxSquareWorkGroupSize =
static_cast<uint32_t>(std::sqrt(limits.maxComputeWorkGroupInvocations));
workgroupSize = std::min(workgroupSize, maxSquareWorkGroupSize);
// Round down to a power of 2. This is to make sure workgroupInvocationsX and
// workgroupInvocationsY are divisible by the workgroup size so that we don't need to apply a
// bounds check in the shader.
uint32_t power = static_cast<uint32_t>(std::log2(static_cast<float>(workgroupSize)));
workgroupSize = 1u << power;
CHECK(workgroupInvocationsX % workgroupSize == 0);
CHECK(workgroupInvocationsY % workgroupSize == 0);
return {
.workgroupSize = workgroupSize,
.groupCountX = workgroupInvocationsX / workgroupSize,
.groupCountY = workgroupInvocationsY / workgroupSize,
};
}
// Find the first memory type index that satisfies the requirements.
// See VkAndroidHardwareBufferPropertiesANDROID::memoryTypeBits for the semantics of
// "memoryTypeBitsRequirement"
std::optional<uint32_t> findMemoryType(const VkPhysicalDeviceMemoryProperties& properties,
uint32_t memoryTypeBitsRequirement,
VkDeviceSize sizeRequirement) {
for (uint32_t memoryIndex = 0; memoryIndex < VK_MAX_MEMORY_TYPES; ++memoryIndex) {
const uint32_t memoryTypeBits = (1 << memoryIndex);
const bool isRequiredMemoryType = memoryTypeBitsRequirement & memoryTypeBits;
const uint32_t heapIndex = properties.memoryTypes[memoryIndex].heapIndex;
const bool isLargeEnough = properties.memoryHeaps[heapIndex].size >= sizeRequirement;
if (isRequiredMemoryType && isLargeEnough) return memoryIndex;
}
// Failed to find a suitable memory type.
return std::nullopt;
}
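// Records a buffer memory barrier into the command buffer. This is used below to transfer
// ownership of the AHardwareBuffer-backed output buffer between the foreign queue family
// (VK_QUEUE_FAMILY_FOREIGN_EXT) and our compute queue family around the dispatch.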
void addBufferTransitionBarrier(VkCommandBuffer commandBuffer, VkBuffer buffer,
VkPipelineStageFlags srcStageMask,
VkPipelineStageFlags dstStageMask, VkAccessFlags srcAccessMask,
VkAccessFlags dstAccessMask, uint32_t srcQueue, uint32_t dstQueue) {
const VkBufferMemoryBarrier bufferBarrier = {
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = srcAccessMask,
.dstAccessMask = dstAccessMask,
.srcQueueFamilyIndex = srcQueue,
.dstQueueFamilyIndex = dstQueue,
.buffer = buffer,
.offset = 0,
.size = VK_WHOLE_SIZE,
};
vkCmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, 0, 0, nullptr, 1,
&bufferBarrier, 0, nullptr);
}
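// Allocates a BLOB-format AHardwareBuffer of "size" bytes with the requested usage bits. For the
// BLOB format, "width" is the length in bytes and "height" and "layers" must be 1.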
void allocateBlobAhwb(uint32_t size, uint64_t usage, AHardwareBuffer** outAhwb) {
AHardwareBuffer_Desc desc = {
.width = size,
.height = 1u,
.layers = 1u,
.format = AHARDWAREBUFFER_FORMAT_BLOB,
.usage = usage,
};
ASSERT_EQ(AHardwareBuffer_allocate(&desc, outAhwb), 0);
}
using NameAndDevice = std::pair<const char*, const ANeuralNetworksDevice*>;
void getNnapiDevices(std::vector<NameAndDevice>* outDevices) {
// Get the number of available NNAPI devices
uint32_t numDevices = 0;
ASSERT_EQ(ANeuralNetworks_getDeviceCount(&numDevices), ANEURALNETWORKS_NO_ERROR);
std::vector<NameAndDevice> devices;
for (uint32_t i = 0; i < numDevices; i++) {
// Get device
ANeuralNetworksDevice* device;
ASSERT_EQ(ANeuralNetworks_getDevice(/*devIndex=*/i, &device), ANEURALNETWORKS_NO_ERROR);
// Get device name
const char* deviceName = nullptr;
ASSERT_EQ(ANeuralNetworksDevice_getName(device, &deviceName), ANEURALNETWORKS_NO_ERROR);
// Check device feature level. This test is added in NNAPI feature level 5, so skip if the
// device is of a lower feature level.
int64_t featureLevel;
ASSERT_EQ(ANeuralNetworksDevice_getFeatureLevel(device, &featureLevel),
ANEURALNETWORKS_NO_ERROR);
if (featureLevel < ANEURALNETWORKS_FEATURE_LEVEL_5) {
continue;
}
devices.emplace_back(deviceName, device);
}
*outDevices = std::move(devices);
}
std::vector<NameAndDevice> getNnapiDevices() {
std::vector<NameAndDevice> devices;
getNnapiDevices(&devices);
return devices;
}
std::string printGpuNnapiTest(const testing::TestParamInfo<NameAndDevice>& info) {
std::string name = info.param.first;
// gtest test names must only contain alphanumeric characters
std::replace_if(
name.begin(), name.end(), [](char c) { return !std::isalnum(c); }, '_');
return name;
}
template <Type dataType>
class VulkanComputePipeline {
public:
// Returns the created object on success, or nullptr on failure.
static std::unique_ptr<VulkanComputePipeline> create(AHardwareBuffer* output) {
auto pipeline = std::make_unique<VulkanComputePipeline>();
pipeline->initialize(output);
return pipeline->mIsValid ? std::move(pipeline) : nullptr;
}
~VulkanComputePipeline() {
if (mDevice != VK_NULL_HANDLE) {
vkDestroyFence(mDevice, mFence, nullptr);
vkDestroyPipeline(mDevice, mPipeline, nullptr);
vkDestroyDescriptorSetLayout(mDevice, mDescriptorSetLayout, nullptr);
vkDestroyPipelineLayout(mDevice, mPipelineLayout, nullptr);
vkFreeMemory(mDevice, mOutputBufferMemory, nullptr);
vkDestroyBuffer(mDevice, mOutputBuffer, nullptr);
vkDestroyShaderModule(mDevice, mShaderModule, nullptr);
vkDestroyCommandPool(mDevice, mCommandPool, nullptr);
vkDestroyDescriptorPool(mDevice, mDescriptorPool, nullptr);
}
vkDestroyDevice(mDevice, nullptr);
vkDestroyInstance(mInstance, nullptr);
}
// Returns {success, sync_fd}
std::pair<bool, base::unique_fd> run() {
bool success = false;
base::unique_fd outSyncFd;
runInternal(&success, &outSyncFd);
return {success, std::move(outSyncFd)};
}
private:
void initialize(AHardwareBuffer* output) {
// Create instance
const VkApplicationInfo applicationDesc = {
.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
.pApplicationName = "TestGpuNnapi",
.applicationVersion = VK_MAKE_VERSION(1, 0, 0),
.apiVersion = VK_API_VERSION_1_1,
};
const VkInstanceCreateInfo instanceDesc = {
.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
.pApplicationInfo = &applicationDesc,
.enabledLayerCount = 0,
.ppEnabledLayerNames = nullptr,
.enabledExtensionCount = 0,
.ppEnabledExtensionNames = nullptr,
};
ASSERT_EQ(vkCreateInstance(&instanceDesc, nullptr, &mInstance), VK_SUCCESS);
// Enumerate physical devices
uint32_t numberOfDevices = 0;
ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, nullptr), VK_SUCCESS);
std::vector<VkPhysicalDevice> physicalDevices(numberOfDevices);
ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, physicalDevices.data()),
VK_SUCCESS);
// Pick the first device with a compute queue
for (const auto& physicalDevice : physicalDevices) {
uint32_t numberOfQueueFamilies = 0;
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
nullptr);
std::vector<VkQueueFamilyProperties> queueFamilies(numberOfQueueFamilies);
vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
queueFamilies.data());
uint32_t pickedQueueFamilyIndex = 0;
bool hasComputeQueue = false;
for (uint32_t i = 0; i < queueFamilies.size(); i++) {
if (queueFamilies[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
pickedQueueFamilyIndex = i;
hasComputeQueue = true;
break;
}
}
if (!hasComputeQueue) continue;
mPhysicalDevice = physicalDevice;
mQueueFamilyIndex = pickedQueueFamilyIndex;
break;
}
if (mPhysicalDevice == VK_NULL_HANDLE) {
GTEST_SKIP() << "No device can handle a compute queue";
}
// Get physical device properties
vkGetPhysicalDeviceProperties(mPhysicalDevice, &mPhysicalDeviceProperties);
vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mPhysicalDeviceMemoryProperties);
// Check physical device version
if (mPhysicalDeviceProperties.apiVersion < VK_API_VERSION_1_1) {
GTEST_SKIP() << "Device API version too low";
}
// Check if the physical device is able to handle the compute work
const auto dispatchSize = chooseDispatchSize<dataType>(mPhysicalDeviceProperties.limits);
if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[0] <
dispatchSize.groupCountX) {
GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountX
<< " workgroups for the X dimension";
}
if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[1] <
dispatchSize.groupCountY) {
GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountY
<< " workgroups for the Y dimension";
}
// Enumerate device extensions
uint32_t numberOfExtensions = 0;
ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
&numberOfExtensions, nullptr),
VK_SUCCESS);
std::vector<VkExtensionProperties> extensions(numberOfExtensions);
ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
&numberOfExtensions, extensions.data()),
VK_SUCCESS);
// Required device extensions
std::vector<const char*> requiredDeviceExtensions = {
// The following extensions are required to import an AHardwareBuffer to Vulkan
VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME,
VK_EXT_QUEUE_FAMILY_FOREIGN_EXTENSION_NAME,
VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME,
VK_KHR_BIND_MEMORY_2_EXTENSION_NAME,
VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
// The following extensions are required to export a sync fence
VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME,
VK_KHR_MAINTENANCE1_EXTENSION_NAME,
};
for (const char* requiredDeviceExtension : requiredDeviceExtensions) {
if (!isExtensionSupported(extensions, requiredDeviceExtension)) {
GTEST_SKIP() << "Device extension " << requiredDeviceExtension
<< " is not supported";
}
}
// Check external memory properties
const VkPhysicalDeviceExternalBufferInfo externalBufferInfo = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO,
.pNext = nullptr,
.flags = 0u,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
};
VkExternalBufferProperties externalBufferProperties;
vkGetPhysicalDeviceExternalBufferProperties(mPhysicalDevice, &externalBufferInfo,
&externalBufferProperties);
if (!(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT)) {
GTEST_SKIP() << "Device is not able to import Android hardware buffer";
}
ASSERT_FALSE(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT);
// Check external fence properties
const VkPhysicalDeviceExternalFenceInfo externalFenceInfo = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO,
.pNext = nullptr,
.handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
};
VkExternalFenceProperties externalFenceProperties;
vkGetPhysicalDeviceExternalFenceProperties(mPhysicalDevice, &externalFenceInfo,
&externalFenceProperties);
if (!(externalFenceProperties.externalFenceFeatures &
VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT)) {
GTEST_SKIP() << "Device is not able to export Android sync fence FD";
}
// Create logical device
const float queuePriority = 1.0f;
const VkDeviceQueueCreateInfo queueDesc = {
.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
.queueFamilyIndex = mQueueFamilyIndex,
.queueCount = 1,
.pQueuePriorities = &queuePriority,
};
const VkDeviceCreateInfo deviceDesc = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.queueCreateInfoCount = 1,
.pQueueCreateInfos = &queueDesc,
.enabledExtensionCount = static_cast<uint32_t>(requiredDeviceExtensions.size()),
.ppEnabledExtensionNames = requiredDeviceExtensions.data(),
.pEnabledFeatures = nullptr,
};
ASSERT_EQ(vkCreateDevice(mPhysicalDevice, &deviceDesc, nullptr, &mDevice), VK_SUCCESS);
vkGetDeviceQueue(mDevice, mQueueFamilyIndex, 0, &mQueue);
// Get extension function pointers
mPfnVkGetFenceFdKHR = reinterpret_cast<PFN_vkGetFenceFdKHR>(
vkGetDeviceProcAddr(mDevice, "vkGetFenceFdKHR"));
ASSERT_NE(mPfnVkGetFenceFdKHR, nullptr);
// Create descriptor pool
const std::vector<VkDescriptorPoolSize> descriptorPoolSizes = {
{
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
},
};
const VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.maxSets = 1,
.poolSizeCount = static_cast<uint32_t>(descriptorPoolSizes.size()),
.pPoolSizes = descriptorPoolSizes.data(),
};
ASSERT_EQ(vkCreateDescriptorPool(mDevice, &descriptorPoolCreateInfo, nullptr,
&mDescriptorPool),
VK_SUCCESS);
// Create descriptor set layout
const std::vector<VkDescriptorSetLayoutBinding> descriptorsetLayoutBinding = {
{
.binding = 0, // output buffer
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
},
};
const VkDescriptorSetLayoutCreateInfo descriptorsetLayoutDesc = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.bindingCount = static_cast<uint32_t>(descriptorsetLayoutBinding.size()),
.pBindings = descriptorsetLayoutBinding.data(),
};
ASSERT_EQ(vkCreateDescriptorSetLayout(mDevice, &descriptorsetLayoutDesc, nullptr,
&mDescriptorSetLayout),
VK_SUCCESS);
// Allocate descriptor set
const VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = {
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.descriptorPool = mDescriptorPool,
.descriptorSetCount = 1,
.pSetLayouts = &mDescriptorSetLayout,
};
ASSERT_EQ(vkAllocateDescriptorSets(mDevice, &descriptorSetAllocateInfo, &mDescriptorSet),
VK_SUCCESS);
// Check the output AHardwareBuffer format and usage bits
AHardwareBuffer_Desc desc;
AHardwareBuffer_describe(output, &desc);
ASSERT_EQ(desc.format, AHARDWAREBUFFER_FORMAT_BLOB);
ASSERT_TRUE(desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER);
// Get AHardwareBuffer properties
VkAndroidHardwareBufferPropertiesANDROID properties = {
.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID,
.pNext = nullptr,
};
ASSERT_EQ(vkGetAndroidHardwareBufferPropertiesANDROID(mDevice, output, &properties),
VK_SUCCESS);
// Create the output buffer with AHardwareBuffer memory
const VkExternalMemoryBufferCreateInfo externalMemoryBufferCreateInfo = {
.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO,
.pNext = nullptr,
.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
};
const VkBufferCreateInfo bufferCreateInfo = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = &externalMemoryBufferCreateInfo,
.flags = 0u,
.size = desc.width,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0u,
.pQueueFamilyIndices = nullptr,
};
ASSERT_EQ(vkCreateBuffer(mDevice, &bufferCreateInfo, nullptr, &mOutputBuffer), VK_SUCCESS);
// Find a proper memory type
const auto maybeMemoryTypeIndex =
findMemoryType(mPhysicalDeviceMemoryProperties, properties.memoryTypeBits,
properties.allocationSize);
if (!maybeMemoryTypeIndex.has_value()) {
GTEST_SKIP() << "None of the memory type is suitable for allocation";
}
// Import the AHardwareBuffer memory
const VkImportAndroidHardwareBufferInfoANDROID importMemoryAllocateInfo = {
.sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID,
.pNext = nullptr,
.buffer = output,
};
const VkMemoryAllocateInfo memoryAllocInfo = {
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = &importMemoryAllocateInfo,
.allocationSize = properties.allocationSize,
.memoryTypeIndex = maybeMemoryTypeIndex.value(),
};
const auto allocationResult =
vkAllocateMemory(mDevice, &memoryAllocInfo, nullptr, &mOutputBufferMemory);
// Memory allocation may fail if the size exceeds the upper limit of a single allocation
// that the platform supports
if (allocationResult == VK_ERROR_OUT_OF_DEVICE_MEMORY) {
GTEST_SKIP() << "Unable to allocate device memory of " << properties.allocationSize
<< " bytes";
}
ASSERT_EQ(allocationResult, VK_SUCCESS);
// Bind the memory with the buffer
ASSERT_EQ(vkBindBufferMemory(mDevice, mOutputBuffer, mOutputBufferMemory, 0), VK_SUCCESS);
// Update the descriptor sets
const VkDescriptorBufferInfo outputBufferDesc = {
.buffer = mOutputBuffer,
.offset = 0,
.range = VK_WHOLE_SIZE,
};
const std::vector<VkWriteDescriptorSet> writeDst = {
{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.pNext = nullptr,
.dstSet = mDescriptorSet,
.dstBinding = 0, // output buffer
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.pImageInfo = nullptr,
.pBufferInfo = &outputBufferDesc,
.pTexelBufferView = nullptr,
},
};
vkUpdateDescriptorSets(mDevice, writeDst.size(), writeDst.data(), 0, nullptr);
// Create shader module
const VkShaderModuleCreateInfo shaderDesc = {
.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
.flags = 0,
.codeSize = kComputeShader.size() * sizeof(uint32_t),
.pCode = kComputeShader.data(),
};
ASSERT_EQ(vkCreateShaderModule(mDevice, &shaderDesc, nullptr, &mShaderModule), VK_SUCCESS);
// Create pipeline layout
const VkPipelineLayoutCreateInfo layoutDesc = {
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.setLayoutCount = 1,
.pSetLayouts = &mDescriptorSetLayout,
.pushConstantRangeCount = 0,
.pPushConstantRanges = nullptr,
};
ASSERT_EQ(vkCreatePipelineLayout(mDevice, &layoutDesc, nullptr, &mPipelineLayout),
VK_SUCCESS);
// Create compute pipeline
const uint32_t specializationData[] = {
dispatchSize.workgroupSize, // local_size_x
dispatchSize.workgroupSize, // local_size_y
TestTypeHelper<dataType>::kClearData, // CLEAR_DATA
};
const std::vector<VkSpecializationMapEntry> specializationMap = {
// {constantID, offset, size}
{0, 0 * sizeof(uint32_t), sizeof(uint32_t)},
{1, 1 * sizeof(uint32_t), sizeof(uint32_t)},
{2, 2 * sizeof(uint32_t), sizeof(uint32_t)},
};
const VkSpecializationInfo specializationInfo = {
.mapEntryCount = static_cast<uint32_t>(specializationMap.size()),
.pMapEntries = specializationMap.data(),
.dataSize = sizeof(specializationData),
.pData = specializationData,
};
const VkComputePipelineCreateInfo pipelineDesc = {
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.stage =
{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = mShaderModule,
.pName = "main",
.pSpecializationInfo = &specializationInfo,
},
.layout = mPipelineLayout,
};
ASSERT_EQ(vkCreateComputePipelines(mDevice, VK_NULL_HANDLE, 1, &pipelineDesc, nullptr,
&mPipeline),
VK_SUCCESS);
// Create command pool
const VkCommandPoolCreateInfo cmdpoolDesc = {
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.flags = 0u,
.queueFamilyIndex = mQueueFamilyIndex,
};
ASSERT_EQ(vkCreateCommandPool(mDevice, &cmdpoolDesc, nullptr, &mCommandPool), VK_SUCCESS);
// Create a command buffer
const VkCommandBufferAllocateInfo cmdBufferCreateInfo = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
.pNext = nullptr,
.commandPool = mCommandPool,
.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
.commandBufferCount = 1,
};
ASSERT_EQ(vkAllocateCommandBuffers(mDevice, &cmdBufferCreateInfo, &mCommandBuffer),
VK_SUCCESS);
// Record command buffer
const VkCommandBufferBeginInfo commandBufferBeginInfo = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
.pNext = nullptr,
.flags = 0,
.pInheritanceInfo = nullptr,
};
ASSERT_EQ(vkBeginCommandBuffer(mCommandBuffer, &commandBufferBeginInfo), VK_SUCCESS);
// Buffer barrier to acquire ownership of the output buffer
addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
VK_ACCESS_SHADER_WRITE_BIT, VK_QUEUE_FAMILY_FOREIGN_EXT,
mQueueFamilyIndex);
// Setup resources
vkCmdBindPipeline(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
vkCmdBindDescriptorSets(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipelineLayout, 0,
1, &mDescriptorSet, 0, nullptr);
// Dispatch compute
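// Each invocation clears one 4-byte chunk, so the grid of
// groupCountX * groupCountY * workgroupSize^2 invocations covers the entire output buffer.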
vkCmdDispatch(mCommandBuffer, dispatchSize.groupCountX, dispatchSize.groupCountY, 1);
// Buffer barrier to release ownership of the output buffer
addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_ACCESS_SHADER_WRITE_BIT,
0, mQueueFamilyIndex, VK_QUEUE_FAMILY_FOREIGN_EXT);
// Finish recording the command buffer
ASSERT_EQ(vkEndCommandBuffer(mCommandBuffer), VK_SUCCESS);
// Create fence
const VkExportFenceCreateInfo exportFenceCreateInfo = {
.sType = VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO,
.pNext = nullptr,
.handleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
};
const VkFenceCreateInfo fenceCreateInfo = {
.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
.pNext = &exportFenceCreateInfo,
.flags = 0,
};
ASSERT_EQ(vkCreateFence(mDevice, &fenceCreateInfo, nullptr, &mFence), VK_SUCCESS);
mIsValid = true;
}
void runInternal(bool* outSuccess, base::unique_fd* outSyncFd) {
*outSuccess = false;
// Submit to queue
const VkSubmitInfo submitInfo = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.waitSemaphoreCount = 0,
.pWaitSemaphores = nullptr,
.pWaitDstStageMask = nullptr,
.commandBufferCount = 1,
.pCommandBuffers = &mCommandBuffer,
.signalSemaphoreCount = 0,
.pSignalSemaphores = nullptr,
};
ASSERT_EQ(vkResetFences(mDevice, 1, &mFence), VK_SUCCESS);
ASSERT_EQ(vkQueueSubmit(mQueue, 1, &submitInfo, mFence), VK_SUCCESS);
// Export an Android sync fence FD
int syncFd = -1;
const VkFenceGetFdInfoKHR fenceGetFdInfo = {
.sType = VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR,
.pNext = nullptr,
.fence = mFence,
.handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
};
ASSERT_EQ(mPfnVkGetFenceFdKHR(mDevice, &fenceGetFdInfo, &syncFd), VK_SUCCESS);
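// If the fence has already signaled by the time it is exported, the driver may return a sync FD
// of -1 here; NnapiExecutor::runInternal treats such an invalid FD as "no dependency".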
*outSyncFd = base::unique_fd(syncFd);
*outSuccess = true;
}
// Instance
VkInstance mInstance = VK_NULL_HANDLE;
// Physical device and queue family
VkPhysicalDevice mPhysicalDevice = VK_NULL_HANDLE;
VkPhysicalDeviceProperties mPhysicalDeviceProperties{};
VkPhysicalDeviceMemoryProperties mPhysicalDeviceMemoryProperties{};
uint32_t mQueueFamilyIndex = 0;
// Logical device and queue
VkDevice mDevice = VK_NULL_HANDLE;
VkQueue mQueue = VK_NULL_HANDLE;
// Extension functions
PFN_vkGetFenceFdKHR mPfnVkGetFenceFdKHR = nullptr;
// Resource descriptors
VkDescriptorPool mDescriptorPool = VK_NULL_HANDLE;
VkDescriptorSetLayout mDescriptorSetLayout = VK_NULL_HANDLE;
VkDescriptorSet mDescriptorSet = VK_NULL_HANDLE;
// Output buffer
VkBuffer mOutputBuffer = VK_NULL_HANDLE;
VkDeviceMemory mOutputBufferMemory = VK_NULL_HANDLE;
// Compute pipeline
VkShaderModule mShaderModule = VK_NULL_HANDLE;
VkPipelineLayout mPipelineLayout = VK_NULL_HANDLE;
VkPipeline mPipeline = VK_NULL_HANDLE;
// Command buffer
VkCommandPool mCommandPool = VK_NULL_HANDLE;
VkCommandBuffer mCommandBuffer = VK_NULL_HANDLE;
VkFence mFence = VK_NULL_HANDLE;
bool mIsValid = false;
};
template <Type dataType>
class NnapiExecutor {
public:
// Returns the created object on success, or nullptr on failure.
static std::unique_ptr<NnapiExecutor> create(const ANeuralNetworksDevice* device,
AHardwareBuffer* input, AHardwareBuffer* output) {
auto nnapi = std::make_unique<NnapiExecutor>(input, output);
nnapi->initialize(device);
return nnapi->mIsValid ? std::move(nnapi) : nullptr;
}
// Prefer NnapiExecutor::create
NnapiExecutor(AHardwareBuffer* input, AHardwareBuffer* output)
: mInputMemory(input), mOutputMemory(output) {}
// Returns {success, sync_fd}
std::pair<bool, base::unique_fd> run(const base::unique_fd& inSyncFd) {
bool success = false;
base::unique_fd outSyncFd;
runInternal(inSyncFd, &success, &outSyncFd);
return {success, std::move(outSyncFd)};
}
private:
using ElementType = typename TestTypeHelper<dataType>::ElementType;
void initialize(const ANeuralNetworksDevice* device) {
ASSERT_TRUE(mInputMemory.isValid());
ASSERT_TRUE(mOutputMemory.isValid());
// Model input
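// Quantized tensor types use scale 1.0 and zero point 0 so that the stored integer value equals
// the real value; for the float tensor types the scale is simply left at 0.0f.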
const float scale = TestTypeHelper<dataType>::kIsQuantized ? 1.0f : 0.0f;
const OperandType tensorType(dataType, {kOperandSizeY, kOperandSizeX}, scale,
/*zeroPoint=*/0);
uint32_t inputTensor = mModel.addOperand(&tensorType);
// Constant tensor
const OperandType constTensorType(dataType, {1}, scale, /*zeroPoint=*/0);
const ElementType constTensorData = static_cast<ElementType>(1);
uint32_t constTensor =
mModel.addConstantOperand<ElementType>(&constTensorType, constTensorData);
// Activation (NONE)
const OperandType activationType(Type::INT32, {});
uint32_t activationScalar = mModel.addConstantOperand<int32_t>(&activationType, 0);
// Model output
uint32_t outputTensor = mModel.addOperand(&tensorType);
// Model operation
mModel.addOperation(ANEURALNETWORKS_ADD, {inputTensor, constTensor, activationScalar},
{outputTensor});
// Finish model
mModel.identifyInputsAndOutputs({inputTensor}, {outputTensor});
mModel.relaxComputationFloat32toFloat16(/*isRelax=*/true);
ASSERT_TRUE(mModel.isValid());
ASSERT_EQ(mModel.finish(), Result::NO_ERROR);
// Create compilation for the target device
Result result;
std::tie(result, mCompilation) =
test_wrapper::Compilation::createForDevice(&mModel, device);
ASSERT_EQ(result, Result::NO_ERROR);
// Finish the compilation
result = mCompilation.finish();
if (result != Result::NO_ERROR) {
GTEST_SKIP() << "Model is not supported by the device";
}
mIsValid = true;
}
void runInternal(const base::unique_fd& inSyncFd, bool* outSuccess,
base::unique_fd* outSyncFd) {
*outSuccess = false;
// Setup execution
mExecution = std::make_unique<test_wrapper::Execution>(&mCompilation);
ASSERT_EQ(mExecution->setInputFromMemory(/*index=*/0, &mInputMemory, /*offset=*/0,
kOperandLength * sizeof(ElementType)),
Result::NO_ERROR);
ASSERT_EQ(mExecution->setOutputFromMemory(/*index=*/0, &mOutputMemory, /*offset=*/0,
kOperandLength * sizeof(ElementType)),
Result::NO_ERROR);
// Setup dependencies
std::vector<const test_wrapper::Event*> dependencies;
test_wrapper::Event start;
// The sync fence from Vulkan may not be valid if the GPU workload has already finished
// prior to exporting the fence.
if (inSyncFd.ok()) {
start = test_wrapper::Event(inSyncFd.get());
ASSERT_TRUE(start.isValid());
dependencies = {&start};
}
// Fenced compute
test_wrapper::Event finished;
mExecution->startComputeWithDependencies(dependencies, /*infinite timeout*/ 0, &finished);
// Get the output sync fence if supported; otherwise, wait until the execution is finished
int syncFd = -1;
finished.getSyncFenceFd(&syncFd);
if (syncFd == -1) {
ASSERT_EQ(finished.wait(), Result::NO_ERROR);
}
*outSyncFd = base::unique_fd(syncFd);
*outSuccess = true;
}
test_wrapper::Model mModel;
test_wrapper::Compilation mCompilation;
std::unique_ptr<test_wrapper::Execution> mExecution;
test_wrapper::Memory mInputMemory, mOutputMemory;
bool mIsValid = false;
};
class GpuNnapiTest : public testing::TestWithParam<NameAndDevice> {
protected:
void TearDown() override {
if (mGpuOutput) {
AHardwareBuffer_release(mGpuOutput);
}
if (mNnapiOutput) {
AHardwareBuffer_release(mNnapiOutput);
}
}
template <Type dataType>
void runTest() {
#ifndef NNTEST_ONLY_PUBLIC_API
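// When the runtime is forced to CPU-only execution, the workload would not actually run on the
// driver under test, so skip.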
if (DeviceManager::get()->getUseCpuOnly()) {
GTEST_SKIP();
}
#endif
// Allocate hardware buffers for GPU and NNAPI outputs
const size_t size = kOperandLength * sizeof(typename TestTypeHelper<dataType>::ElementType);
allocateBlobAhwb(
size, AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER | AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
&mGpuOutput);
allocateBlobAhwb(
size, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN,
&mNnapiOutput);
if (mGpuOutput == nullptr || mNnapiOutput == nullptr) return;
// Create Vulkan compute pipeline
auto vulkan = VulkanComputePipeline<dataType>::create(mGpuOutput);
if (vulkan == nullptr) return;
// Create NNAPI executor
auto nnapi = NnapiExecutor<dataType>::create(kDevice, mGpuOutput, mNnapiOutput);
if (nnapi == nullptr) return;
// Run the test repeatedly for kNumberOfIterationsToTest iterations
for (uint32_t i = 0; i < kNumberOfIterationsToTest; i++) {
auto [gpuSuccess, gpuSyncFd] = vulkan->run();
ASSERT_TRUE(gpuSuccess);
auto [nnapiSuccess, nnapiSyncFd] = nnapi->run(gpuSyncFd);
ASSERT_TRUE(nnapiSuccess);
checkResults<dataType>(std::move(nnapiSyncFd));
}
}
template <Type dataType>
void checkResults(base::unique_fd syncFd) {
using ElementType = typename TestTypeHelper<dataType>::ElementType;
// Lock the buffer with the sync fence
// AHardwareBuffer_lock takes ownership of the sync fence and closes it even on errors
void* data;
ASSERT_EQ(AHardwareBuffer_lock(mNnapiOutput, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
syncFd.release(), /*rect=*/nullptr, &data),
0);
// Compare the actual results with the expected value
uint32_t numberOfErrors = 0;
const ElementType expected = static_cast<ElementType>(kExpectedResultInInt);
for (uint32_t i = 0; i < kOperandLength; i++) {
const ElementType actual = reinterpret_cast<ElementType*>(data)[i];
// We expect bit-exact results here because the arithmetic is trivial, and all intermediate
// and final results can be exactly represented by the primary data type.
if (actual != expected) {
// Print at most kMaxNumberOfPrintedErrors errors by EXPECT_EQ
if (numberOfErrors < kMaxNumberOfPrintedErrors) {
EXPECT_EQ(actual, expected)
<< "When comparing element [" << i / kOperandSizeX << ", "
<< i % kOperandSizeX << "]";
}
numberOfErrors++;
}
}
EXPECT_EQ(numberOfErrors, 0u);
ASSERT_EQ(AHardwareBuffer_unlock(mNnapiOutput, /*fence=*/nullptr), 0);
}
// The NNAPI device under test
const ANeuralNetworksDevice* kDevice = GetParam().second;
AHardwareBuffer* mGpuOutput = nullptr;
AHardwareBuffer* mNnapiOutput = nullptr;
};
TEST_P(GpuNnapiTest, Float32) {
runTest<Type::TENSOR_FLOAT32>();
}
TEST_P(GpuNnapiTest, Float16) {
runTest<Type::TENSOR_FLOAT16>();
}
TEST_P(GpuNnapiTest, Quant8Asymm) {
runTest<Type::TENSOR_QUANT8_ASYMM>();
}
TEST_P(GpuNnapiTest, Quant8AsymmSigned) {
runTest<Type::TENSOR_QUANT8_ASYMM_SIGNED>();
}
INSTANTIATE_TEST_SUITE_P(TestGpuNnapi, GpuNnapiTest, testing::ValuesIn(getNnapiDevices()),
printGpuNnapiTest);
} // namespace
} // namespace android::nn