/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <android-base/logging.h>
#include <android-base/unique_fd.h>
#include <android/hardware_buffer.h>
#include <gtest/gtest.h>
#include <vulkan/vulkan.h>
#include <vulkan/vulkan_android.h>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "TestNeuralNetworksWrapper.h"

#ifndef NNTEST_ONLY_PUBLIC_API
#include "Manager.h"
#endif

namespace android::nn {
namespace {

using Type = test_wrapper::Type;
using OperandType = test_wrapper::OperandType;
using Result = test_wrapper::Result;

constexpr uint32_t kOperandSizeX = 256;
constexpr uint32_t kOperandSizeY = 256;
constexpr uint32_t kOperandLength = kOperandSizeX * kOperandSizeY;
constexpr uint32_t kNumberOfIterationsToTest = 100;
constexpr uint32_t kMaxNumberOfPrintedErrors = 10;

// This file implements a test suite that exercises a GPU -> NNAPI pipeline using AHardwareBuffer
// and sync fence. One pass of the pipeline involves the following three stages:
//
//   - GPU: Invoke the compute shader to clear all elements in the output buffer to value "1"
//     of the corresponding element type. Because the GPU may not natively support
//     float16/int8/uint8 data types, we pack each data type into a 4-byte chunk as uint32_t
//     and pass it to the shader. E.g., float16 will be packed as 0x3c003c00 -- the float16 value
//     "1" (0x3c00) repeated twice. The compute shader uses this 4-byte chunk to clear the data
//     in the output buffer (see CLEAR_DATA in the compute shader code).
//
//     The GPU workload outputs directly to an AHardwareBuffer and exports an Android sync
//     fence.
//
//   - NNAPI: Execute a broadcast ADD operation
//
//         output = ADD(input, const, act)
//
//     where "input" and "output" are of size [kOperandSizeY, kOperandSizeX], and "const" and
//     "act" are model constant operands: "const" is of size [1] with value "1" of the
//     corresponding element type, and "act" = 0. The ADD operation increments each element of
//     the input tensor by 1.
//
//     The NNAPI executor takes the GPU output AHardwareBuffer as its input memory, and outputs
//     directly to another AHardwareBuffer. We use startComputeWithDependencies to wait on the
//     sync fence from the GPU workload. If supported, the NNAPI executor emits a sync fence;
//     otherwise, it waits until the workload is finished.
//
//   - Check: Verify that each element in the resulting tensor is 1 + 1 = 2.
//
// We use the introspection API to run the pipeline with each individual driver. Because this test
// was added in NNAPI feature level 5, we exclude devices with a lower feature level. We expect
// that if the driver successfully prepares the model, it should finish execution without an error.
//
// The pipeline is tested with four data types: float32, float16, quant8_asymm, and
// quant8_asymm_signed. These data types are chosen to make sure that a driver is likely to
// support at least one of them.
//
// For each configuration, we run the pipeline for kNumberOfIterationsToTest iterations.

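// A compact view of one iteration, using the names defined later in this file:
//
//     VulkanComputePipeline::run()      -> clears mGpuOutput to "1", exports gpuSyncFd
//     NnapiExecutor::run(gpuSyncFd)     -> waits on gpuSyncFd, writes mGpuOutput + 1 into
//                                          mNnapiOutput, exports nnapiSyncFd (if supported)
//     GpuNnapiTest::checkResults(...)   -> locks mNnapiOutput with nnapiSyncFd and verifies
//                                          that every element equals 2
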
const std::vector<uint32_t> kComputeShader =
#include "shaders/TestGpuNnapi.comp.spv.inl"
        ;

// The expected element value in the final NNAPI output AHardwareBuffer.
constexpr uint32_t kExpectedResultInInt = 2;

// Helper template for information related to a primary tensor data type. Only four
// specializations exist for this template: Type::TENSOR_FLOAT32, Type::TENSOR_FLOAT16,
// Type::TENSOR_QUANT8_ASYMM, and Type::TENSOR_QUANT8_ASYMM_SIGNED. Each specialization
// corresponds to a primary data type for the testing pipeline.
//
// Each template specialization defines the following fields:
// - ElementType: The corresponding C++ type. Use sizeof(ElementType) to get the element size.
// - kIsQuantized: Whether the data type is a quantized type or not.
// - kClearData: The CLEAR_DATA used in the compute shader.
template <Type dataType>
struct TestTypeHelper;
template <>
struct TestTypeHelper<Type::TENSOR_FLOAT32> {
    using ElementType = float;
    static constexpr bool kIsQuantized = false;
    // One float32 of value (1.0) packed into uint32_t
    static constexpr uint32_t kClearData = 0x3f800000;
};
template <>
struct TestTypeHelper<Type::TENSOR_FLOAT16> {
    using ElementType = _Float16;
    static constexpr bool kIsQuantized = false;
    // Two float16 of value (1.0) packed into uint32_t
    static constexpr uint32_t kClearData = 0x3c003c00;
};
template <>
struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM> {
    using ElementType = uint8_t;
    static constexpr bool kIsQuantized = true;
    // Four uint8_t of value (1) packed into uint32_t
    static constexpr uint32_t kClearData = 0x01010101;
};
template <>
struct TestTypeHelper<Type::TENSOR_QUANT8_ASYMM_SIGNED> {
    using ElementType = int8_t;
    static constexpr bool kIsQuantized = true;
    // Four int8_t of value (1) packed into uint32_t
    static constexpr uint32_t kClearData = 0x01010101;
};

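// Illustrative compile-time checks of the packing described above: each kClearData chunk holds
// sizeof(uint32_t) / sizeof(ElementType) copies of the element value "1".
static_assert(sizeof(TestTypeHelper<Type::TENSOR_FLOAT32>::ElementType) == sizeof(uint32_t),
              "float32 packs one element per 4-byte chunk");
static_assert(sizeof(TestTypeHelper<Type::TENSOR_FLOAT16>::ElementType) * 2 == sizeof(uint32_t),
              "float16 packs two elements per 4-byte chunk");
static_assert(sizeof(TestTypeHelper<Type::TENSOR_QUANT8_ASYMM>::ElementType) * 4 ==
                      sizeof(uint32_t),
              "quant8_asymm packs four elements per 4-byte chunk");
static_assert(sizeof(TestTypeHelper<Type::TENSOR_QUANT8_ASYMM_SIGNED>::ElementType) * 4 ==
                      sizeof(uint32_t),
              "quant8_asymm_signed packs four elements per 4-byte chunk");
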
bool isExtensionSupported(const std::vector<VkExtensionProperties>& supportedExtensions,
                          const char* requestedExtension) {
    return std::any_of(supportedExtensions.begin(), supportedExtensions.end(),
                       [requestedExtension](const auto& extension) {
                           return strcmp(extension.extensionName, requestedExtension) == 0;
                       });
}

// Records the workgroup size and the group counts of dispatching the compute shader.
struct DispatchSize {
    uint32_t workgroupSize;
    uint32_t groupCountX;
    uint32_t groupCountY;
};

// Choose an appropriate dispatch size. We are using a square workgroup size.
template <Type dataType>
DispatchSize chooseDispatchSize(const VkPhysicalDeviceLimits& limits) {
    // Compute the number of invocations along each dimension.
    const uint32_t elementSize = sizeof(typename TestTypeHelper<dataType>::ElementType);
    const uint32_t numberOfElementsPerInvocation = sizeof(uint32_t) / elementSize;
    const uint32_t workgroupInvocationsX = kOperandSizeX / numberOfElementsPerInvocation;
    const uint32_t workgroupInvocationsY = kOperandSizeY;

    // Make sure the workgroup size does not exceed the number of invocations along the X and Y
    // dimensions.
    uint32_t workgroupSize = std::min(workgroupInvocationsX, workgroupInvocationsY);

    // Make sure the workgroup size does not exceed the device limit along the X and Y dimensions.
    workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[0]);
    workgroupSize = std::min<uint32_t>(workgroupSize, limits.maxComputeWorkGroupSize[1]);

    // Make sure the total number of invocations does not exceed the device limit.
    uint32_t maxSquareWorkGroupSize =
            static_cast<uint32_t>(std::sqrt(limits.maxComputeWorkGroupInvocations));
    workgroupSize = std::min(workgroupSize, maxSquareWorkGroupSize);

    // Round down to a power of 2. This is to make sure workgroupInvocationsX and
    // workgroupInvocationsY are divisible by the workgroup size so that we don't need to apply
    // bounds checks in the shader.
    uint32_t power = static_cast<uint32_t>(std::log2(static_cast<float>(workgroupSize)));
    workgroupSize = 1u << power;
    CHECK(workgroupInvocationsX % workgroupSize == 0);
    CHECK(workgroupInvocationsY % workgroupSize == 0);

    return {
            .workgroupSize = workgroupSize,
            .groupCountX = workgroupInvocationsX / workgroupSize,
            .groupCountY = workgroupInvocationsY / workgroupSize,
    };
}

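// A worked example of chooseDispatchSize() above, assuming (for illustration) a device with
// maxComputeWorkGroupInvocations = 1024 and maxComputeWorkGroupSize[0..1] >= 128: for
// TENSOR_FLOAT16, elementSize = 2, so each invocation clears 4 / 2 = 2 elements, giving
// workgroupInvocationsX = 256 / 2 = 128 and workgroupInvocationsY = 256. The workgroup size
// starts at min(128, 256) = 128, is capped by the square-root limit sqrt(1024) = 32, and 32 is
// already a power of 2. The result is {workgroupSize = 32, groupCountX = 128 / 32 = 4,
// groupCountY = 256 / 32 = 8}.
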
// Find the first memory index that satisfies the requirements
// See VkAndroidHardwareBufferPropertiesANDROID::memoryTypeBits for the semantics of
// "memoryTypeBitsRequirement"
std::optional<uint32_t> findMemoryType(const VkPhysicalDeviceMemoryProperties& properties,
                                       uint32_t memoryTypeBitsRequirement,
                                       VkDeviceSize sizeRequirement) {
    for (uint32_t memoryIndex = 0; memoryIndex < VK_MAX_MEMORY_TYPES; ++memoryIndex) {
        const uint32_t memoryTypeBits = (1 << memoryIndex);
        const bool isRequiredMemoryType = memoryTypeBitsRequirement & memoryTypeBits;
        const uint32_t heapIndex = properties.memoryTypes[memoryIndex].heapIndex;
        const bool isLargeEnough = properties.memoryHeaps[heapIndex].size >= sizeRequirement;
        if (isRequiredMemoryType && isLargeEnough) return memoryIndex;
    }

    // failed to find memory type.
    return std::nullopt;
}

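// Note on findMemoryType(): bit i of memoryTypeBitsRequirement being set means that memory type
// index i is acceptable; e.g., a requirement of 0b0110 allows memory type indices 1 and 2.
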
void addBufferTransitionBarrier(VkCommandBuffer commandBuffer, VkBuffer buffer,
                                VkPipelineStageFlags srcStageMask,
                                VkPipelineStageFlags dstStageMask, VkAccessFlags srcAccessMask,
                                VkAccessFlags dstAccessMask, uint32_t srcQueue, uint32_t dstQueue) {
    const VkBufferMemoryBarrier bufferBarrier = {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = srcAccessMask,
            .dstAccessMask = dstAccessMask,
            .srcQueueFamilyIndex = srcQueue,
            .dstQueueFamilyIndex = dstQueue,
            .buffer = buffer,
            .offset = 0,
            .size = VK_WHOLE_SIZE,
    };
    vkCmdPipelineBarrier(commandBuffer, srcStageMask, dstStageMask, 0, 0, nullptr, 1,
                         &bufferBarrier, 0, nullptr);
}

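// For AHARDWAREBUFFER_FORMAT_BLOB, the buffer is a plain array of bytes: "width" is the size in
// bytes, and "height" and "layers" must both be 1.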
void allocateBlobAhwb(uint32_t size, uint64_t usage, AHardwareBuffer** outAhwb) {
    AHardwareBuffer_Desc desc = {
            .width = size,
            .height = 1u,
            .layers = 1u,
            .format = AHARDWAREBUFFER_FORMAT_BLOB,
            .usage = usage,
    };
    ASSERT_EQ(AHardwareBuffer_allocate(&desc, outAhwb), 0);
}

using NameAndDevice = std::pair<const char*, const ANeuralNetworksDevice*>;

void getNnapiDevices(std::vector<NameAndDevice>* outDevices) {
    // Get the number of available NNAPI devices
    uint32_t numDevices = 0;
    ASSERT_EQ(ANeuralNetworks_getDeviceCount(&numDevices), ANEURALNETWORKS_NO_ERROR);

    std::vector<NameAndDevice> devices;
    for (uint32_t i = 0; i < numDevices; i++) {
        // Get device
        ANeuralNetworksDevice* device;
        ASSERT_EQ(ANeuralNetworks_getDevice(/*devIndex=*/i, &device), ANEURALNETWORKS_NO_ERROR);

        // Get device name
        const char* deviceName = nullptr;
        ASSERT_EQ(ANeuralNetworksDevice_getName(device, &deviceName), ANEURALNETWORKS_NO_ERROR);

        // Check device feature level. This test is added in NNAPI feature level 5, so skip if the
        // device is of a lower feature level.
        int64_t featureLevel;
        ASSERT_EQ(ANeuralNetworksDevice_getFeatureLevel(device, &featureLevel),
                  ANEURALNETWORKS_NO_ERROR);
        if (featureLevel < ANEURALNETWORKS_FEATURE_LEVEL_5) {
            continue;
        }

        devices.emplace_back(deviceName, device);
    }
    *outDevices = std::move(devices);
}

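// gtest ASSERT_* macros may only be used in functions returning void, hence the out-parameter
// overload above. This wrapper returns the device list by value for INSTANTIATE_TEST_SUITE_P.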
std::vector<NameAndDevice> getNnapiDevices() {
    std::vector<NameAndDevice> devices;
    getNnapiDevices(&devices);
    return devices;
}

std::string printGpuNnapiTest(const testing::TestParamInfo<NameAndDevice>& info) {
    std::string name = info.param.first;
    // gtest test names must only contain alphanumeric characters
    std::replace_if(
            name.begin(), name.end(), [](char c) { return !std::isalnum(c); }, '_');
    return name;
}

template <Type dataType>
class VulkanComputePipeline {
   public:
    // Returns the created object on success, or nullptr on failure.
    static std::unique_ptr<VulkanComputePipeline> create(AHardwareBuffer* output) {
        auto pipeline = std::make_unique<VulkanComputePipeline>();
        pipeline->initialize(output);
        return pipeline->mIsValid ? std::move(pipeline) : nullptr;
    }

    ~VulkanComputePipeline() {
        if (mDevice != VK_NULL_HANDLE) {
            vkDestroyFence(mDevice, mFence, nullptr);
            vkDestroyPipeline(mDevice, mPipeline, nullptr);
            vkDestroyDescriptorSetLayout(mDevice, mDescriptorSetLayout, nullptr);
            vkDestroyPipelineLayout(mDevice, mPipelineLayout, nullptr);
            vkFreeMemory(mDevice, mOutputBufferMemory, nullptr);
            vkDestroyBuffer(mDevice, mOutputBuffer, nullptr);
            vkDestroyShaderModule(mDevice, mShaderModule, nullptr);
            vkDestroyCommandPool(mDevice, mCommandPool, nullptr);
            vkDestroyDescriptorPool(mDevice, mDescriptorPool, nullptr);
        }
        vkDestroyDevice(mDevice, nullptr);
        vkDestroyInstance(mInstance, nullptr);
    }

    // Returns {success, sync_fd}
    std::pair<bool, base::unique_fd> run() {
        bool success = false;
        base::unique_fd outSyncFd;
        runInternal(&success, &outSyncFd);
        return {success, std::move(outSyncFd)};
    }

   private:
    void initialize(AHardwareBuffer* output) {
        // Create instance
        const VkApplicationInfo applicationDesc = {
                .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
                .pApplicationName = "TestGpuNnapi",
                .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
                .apiVersion = VK_API_VERSION_1_1,
        };
        const VkInstanceCreateInfo instanceDesc = {
                .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
                .pApplicationInfo = &applicationDesc,
                .enabledLayerCount = 0,
                .ppEnabledLayerNames = nullptr,
                .enabledExtensionCount = 0,
                .ppEnabledExtensionNames = nullptr,
        };
        ASSERT_EQ(vkCreateInstance(&instanceDesc, nullptr, &mInstance), VK_SUCCESS);

        // Enumerate physical devices
        uint32_t numberOfDevices = 0;
        ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, nullptr), VK_SUCCESS);
        std::vector<VkPhysicalDevice> physicalDevices(numberOfDevices);
        ASSERT_EQ(vkEnumeratePhysicalDevices(mInstance, &numberOfDevices, physicalDevices.data()),
                  VK_SUCCESS);

        // Pick the first device with a compute queue
        for (const auto& physicalDevice : physicalDevices) {
            uint32_t numberOfQueueFamilies = 0;
            vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
                                                     nullptr);
            std::vector<VkQueueFamilyProperties> queueFamilies(numberOfQueueFamilies);
            vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &numberOfQueueFamilies,
                                                     queueFamilies.data());

            uint32_t pickedQueueFamilyIndex = 0;
            bool hasComputeQueue = false;
            for (uint32_t i = 0; i < queueFamilies.size(); i++) {
                if (queueFamilies[i].queueFlags & VK_QUEUE_COMPUTE_BIT) {
                    pickedQueueFamilyIndex = i;
                    hasComputeQueue = true;
                    break;
                }
            }
            if (!hasComputeQueue) continue;
            mPhysicalDevice = physicalDevice;
            mQueueFamilyIndex = pickedQueueFamilyIndex;
            break;
        }
        if (mPhysicalDevice == VK_NULL_HANDLE) {
            GTEST_SKIP() << "No device can handle a compute queue";
        }

        // Get physical device properties
        vkGetPhysicalDeviceProperties(mPhysicalDevice, &mPhysicalDeviceProperties);
        vkGetPhysicalDeviceMemoryProperties(mPhysicalDevice, &mPhysicalDeviceMemoryProperties);

        // Check physical device version
        if (mPhysicalDeviceProperties.apiVersion < VK_API_VERSION_1_1) {
            GTEST_SKIP() << "Device API version too low";
        }

        // Check if the physical device is able to handle the compute work
        const auto dispatchSize = chooseDispatchSize<dataType>(mPhysicalDeviceProperties.limits);
        if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[0] <
            dispatchSize.groupCountX) {
            GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountX
                         << " workgroups for the X dimension";
        }
        if (mPhysicalDeviceProperties.limits.maxComputeWorkGroupCount[1] <
            dispatchSize.groupCountY) {
            GTEST_SKIP() << "Device cannot handle " << dispatchSize.groupCountY
                         << " workgroups for the Y dimension";
        }

        // Enumerate device extensions
        uint32_t numberOfExtensions = 0;
        ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
                                                       &numberOfExtensions, nullptr),
                  VK_SUCCESS);
        std::vector<VkExtensionProperties> extensions(numberOfExtensions);
        ASSERT_EQ(vkEnumerateDeviceExtensionProperties(mPhysicalDevice, nullptr,
                                                       &numberOfExtensions, extensions.data()),
                  VK_SUCCESS);

        // Required device extensions
        std::vector<const char*> requiredDeviceExtensions = {
                // The following extensions are required to import an AHardwareBuffer to Vulkan
                VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME,
                VK_EXT_QUEUE_FAMILY_FOREIGN_EXTENSION_NAME,
                VK_KHR_GET_MEMORY_REQUIREMENTS_2_EXTENSION_NAME,
                VK_KHR_BIND_MEMORY_2_EXTENSION_NAME,
                VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
                // The following extensions are required to export a sync fence
                VK_KHR_EXTERNAL_FENCE_FD_EXTENSION_NAME,
                VK_KHR_MAINTENANCE1_EXTENSION_NAME,
        };
        for (const char* requiredDeviceExtension : requiredDeviceExtensions) {
            if (!isExtensionSupported(extensions, requiredDeviceExtension)) {
                GTEST_SKIP() << "Device extension " << requiredDeviceExtension
                             << " is not supported";
            }
        }

        // Check external memory properties
        const VkPhysicalDeviceExternalBufferInfo externalBufferInfo = {
                .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO,
                .pNext = nullptr,
                .flags = 0u,
                .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
        };
        VkExternalBufferProperties externalBufferProperties;
        vkGetPhysicalDeviceExternalBufferProperties(mPhysicalDevice, &externalBufferInfo,
                                                    &externalBufferProperties);
        if (!(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
              VK_EXTERNAL_MEMORY_FEATURE_IMPORTABLE_BIT)) {
            GTEST_SKIP() << "Device is not able to import Android hardware buffer";
        }
        ASSERT_FALSE(externalBufferProperties.externalMemoryProperties.externalMemoryFeatures &
                     VK_EXTERNAL_MEMORY_FEATURE_DEDICATED_ONLY_BIT);

        // Check external fence properties
        const VkPhysicalDeviceExternalFenceInfo externalFenceInfo = {
                .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_FENCE_INFO,
                .pNext = nullptr,
                .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
        };
        VkExternalFenceProperties externalFenceProperties;
        vkGetPhysicalDeviceExternalFenceProperties(mPhysicalDevice, &externalFenceInfo,
                                                   &externalFenceProperties);
        if (!(externalFenceProperties.externalFenceFeatures &
              VK_EXTERNAL_FENCE_FEATURE_EXPORTABLE_BIT)) {
            GTEST_SKIP() << "Device is not able to export Android sync fence FD";
        }

        // Create logical device
        const float queuePriority = 1.0f;
        const VkDeviceQueueCreateInfo queueDesc = {
                .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
                .queueFamilyIndex = mQueueFamilyIndex,
                .queueCount = 1,
                .pQueuePriorities = &queuePriority,
        };
        const VkDeviceCreateInfo deviceDesc = {
                .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
                .queueCreateInfoCount = 1,
                .pQueueCreateInfos = &queueDesc,
                .enabledExtensionCount = static_cast<uint32_t>(requiredDeviceExtensions.size()),
                .ppEnabledExtensionNames = requiredDeviceExtensions.data(),
                .pEnabledFeatures = nullptr,
        };
        ASSERT_EQ(vkCreateDevice(mPhysicalDevice, &deviceDesc, nullptr, &mDevice), VK_SUCCESS);
        vkGetDeviceQueue(mDevice, mQueueFamilyIndex, 0, &mQueue);

        // Get extension function pointers
        mPfnVkGetFenceFdKHR = reinterpret_cast<PFN_vkGetFenceFdKHR>(
                vkGetDeviceProcAddr(mDevice, "vkGetFenceFdKHR"));
        ASSERT_NE(mPfnVkGetFenceFdKHR, nullptr);

        // Create descriptor pool
        const std::vector<VkDescriptorPoolSize> descriptorPoolSizes = {
                {
                        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                        .descriptorCount = 1,
                },
        };
        const VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
                .maxSets = 1,
                .poolSizeCount = static_cast<uint32_t>(descriptorPoolSizes.size()),
                .pPoolSizes = descriptorPoolSizes.data(),
        };
        ASSERT_EQ(vkCreateDescriptorPool(mDevice, &descriptorPoolCreateInfo, nullptr,
                                         &mDescriptorPool),
                  VK_SUCCESS);

        // Create descriptor set layout
        const std::vector<VkDescriptorSetLayoutBinding> descriptorsetLayoutBinding = {
                {
                        .binding = 0,  // output buffer
                        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                        .descriptorCount = 1,
                        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
                },
        };
        const VkDescriptorSetLayoutCreateInfo descriptorsetLayoutDesc = {
                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
                .bindingCount = static_cast<uint32_t>(descriptorsetLayoutBinding.size()),
                .pBindings = descriptorsetLayoutBinding.data(),
        };
        ASSERT_EQ(vkCreateDescriptorSetLayout(mDevice, &descriptorsetLayoutDesc, nullptr,
                                              &mDescriptorSetLayout),
                  VK_SUCCESS);

        // Allocate descriptor set
        const VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = {
                .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
                .descriptorPool = mDescriptorPool,
                .descriptorSetCount = 1,
                .pSetLayouts = &mDescriptorSetLayout,
        };
        ASSERT_EQ(vkAllocateDescriptorSets(mDevice, &descriptorSetAllocateInfo, &mDescriptorSet),
                  VK_SUCCESS);

        // Check the output AHardwareBuffer format and usage bits
        AHardwareBuffer_Desc desc;
        AHardwareBuffer_describe(output, &desc);
        ASSERT_EQ(desc.format, AHARDWAREBUFFER_FORMAT_BLOB);
        ASSERT_TRUE(desc.usage & AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER);

        // Get AHardwareBuffer properties
        VkAndroidHardwareBufferPropertiesANDROID properties = {
                .sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID,
                .pNext = nullptr,
        };
        ASSERT_EQ(vkGetAndroidHardwareBufferPropertiesANDROID(mDevice, output, &properties),
                  VK_SUCCESS);

        // Create the output buffer with AHardwareBuffer memory
        const VkExternalMemoryBufferCreateInfo externalMemoryBufferCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO,
                .pNext = nullptr,
                .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID,
        };
        const VkBufferCreateInfo bufferCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
                .pNext = &externalMemoryBufferCreateInfo,
                .flags = 0u,
                .size = desc.width,
                .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
                .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
                .queueFamilyIndexCount = 0u,
                .pQueueFamilyIndices = nullptr,
        };
        ASSERT_EQ(vkCreateBuffer(mDevice, &bufferCreateInfo, nullptr, &mOutputBuffer), VK_SUCCESS);

        // Find a proper memory type
        const auto maybeMemoryTypeIndex =
                findMemoryType(mPhysicalDeviceMemoryProperties, properties.memoryTypeBits,
                               properties.allocationSize);
        if (!maybeMemoryTypeIndex.has_value()) {
            GTEST_SKIP() << "None of the memory types is suitable for allocation";
        }

        // Import the AHardwareBuffer memory
        const VkImportAndroidHardwareBufferInfoANDROID importMemoryAllocateInfo = {
                .sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID,
                .pNext = nullptr,
                .buffer = output,
        };
        const VkMemoryAllocateInfo memoryAllocInfo = {
                .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
                .pNext = &importMemoryAllocateInfo,
                .allocationSize = properties.allocationSize,
                .memoryTypeIndex = maybeMemoryTypeIndex.value(),
        };
        const auto allocationResult =
                vkAllocateMemory(mDevice, &memoryAllocInfo, nullptr, &mOutputBufferMemory);
        // Memory allocation may fail if the size exceeds the upper limit of a single allocation
        // that the platform supports
        if (allocationResult == VK_ERROR_OUT_OF_DEVICE_MEMORY) {
            GTEST_SKIP() << "Unable to allocate device memory of " << properties.allocationSize
                         << " bytes";
        }
        ASSERT_EQ(allocationResult, VK_SUCCESS);

        // Bind the memory with the buffer
        ASSERT_EQ(vkBindBufferMemory(mDevice, mOutputBuffer, mOutputBufferMemory, 0), VK_SUCCESS);

        // Update the descriptor sets
        const VkDescriptorBufferInfo outputBufferDesc = {
                .buffer = mOutputBuffer,
                .offset = 0,
                .range = VK_WHOLE_SIZE,
        };
        const std::vector<VkWriteDescriptorSet> writeDst = {
                {
                        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
                        .pNext = nullptr,
                        .dstSet = mDescriptorSet,
                        .dstBinding = 0,  // output buffer
                        .dstArrayElement = 0,
                        .descriptorCount = 1,
                        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
                        .pImageInfo = nullptr,
                        .pBufferInfo = &outputBufferDesc,
                        .pTexelBufferView = nullptr,
                },
        };
        vkUpdateDescriptorSets(mDevice, writeDst.size(), writeDst.data(), 0, nullptr);

        // Create shader module
        const VkShaderModuleCreateInfo shaderDesc = {
                .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
                .flags = 0,
                .codeSize = kComputeShader.size() * sizeof(uint32_t),
                .pCode = kComputeShader.data(),
        };
        ASSERT_EQ(vkCreateShaderModule(mDevice, &shaderDesc, nullptr, &mShaderModule), VK_SUCCESS);

        // Create pipeline layout
        const VkPipelineLayoutCreateInfo layoutDesc = {
                .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
                .setLayoutCount = 1,
                .pSetLayouts = &mDescriptorSetLayout,
                .pushConstantRangeCount = 0,
                .pPushConstantRanges = nullptr,
        };
        ASSERT_EQ(vkCreatePipelineLayout(mDevice, &layoutDesc, nullptr, &mPipelineLayout),
                  VK_SUCCESS);

        // Create compute pipeline
        const uint32_t specializationData[] = {
                dispatchSize.workgroupSize,            // local_size_x
                dispatchSize.workgroupSize,            // local_size_y
                TestTypeHelper<dataType>::kClearData,  // CLEAR_DATA
        };
        const std::vector<VkSpecializationMapEntry> specializationMap = {
                // {constantID, offset, size}
                {0, 0 * sizeof(uint32_t), sizeof(uint32_t)},
                {1, 1 * sizeof(uint32_t), sizeof(uint32_t)},
                {2, 2 * sizeof(uint32_t), sizeof(uint32_t)},
        };
        const VkSpecializationInfo specializationInfo = {
                .mapEntryCount = static_cast<uint32_t>(specializationMap.size()),
                .pMapEntries = specializationMap.data(),
                .dataSize = sizeof(specializationData),
                .pData = specializationData,
        };
        const VkComputePipelineCreateInfo pipelineDesc = {
                .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
                .stage =
                        {
                                .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
                                .stage = VK_SHADER_STAGE_COMPUTE_BIT,
                                .module = mShaderModule,
                                .pName = "main",
                                .pSpecializationInfo = &specializationInfo,
                        },
                .layout = mPipelineLayout,
        };
        ASSERT_EQ(vkCreateComputePipelines(mDevice, VK_NULL_HANDLE, 1, &pipelineDesc, nullptr,
                                           &mPipeline),
                  VK_SUCCESS);

        // Create command pool
        const VkCommandPoolCreateInfo cmdpoolDesc = {
                .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
                .flags = 0u,
                .queueFamilyIndex = mQueueFamilyIndex,
        };
        ASSERT_EQ(vkCreateCommandPool(mDevice, &cmdpoolDesc, nullptr, &mCommandPool), VK_SUCCESS);

        // Create a command buffer
        const VkCommandBufferAllocateInfo cmdBufferCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
                .pNext = nullptr,
                .commandPool = mCommandPool,
                .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
                .commandBufferCount = 1,
        };
        ASSERT_EQ(vkAllocateCommandBuffers(mDevice, &cmdBufferCreateInfo, &mCommandBuffer),
                  VK_SUCCESS);

        // Record command buffer
        const VkCommandBufferBeginInfo commandBufferBeginInfo = {
                .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
                .pNext = nullptr,
                .flags = 0,
                .pInheritanceInfo = nullptr,
        };
        ASSERT_EQ(vkBeginCommandBuffer(mCommandBuffer, &commandBufferBeginInfo), VK_SUCCESS);

        // Buffer barrier to acquire the ownership of the output buffer
        addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
                                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0,
                                   VK_ACCESS_SHADER_WRITE_BIT, VK_QUEUE_FAMILY_FOREIGN_EXT,
                                   mQueueFamilyIndex);

        // Setup resources
        vkCmdBindPipeline(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipeline);
        vkCmdBindDescriptorSets(mCommandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, mPipelineLayout, 0,
                                1, &mDescriptorSet, 0, nullptr);

        // Dispatch compute
        vkCmdDispatch(mCommandBuffer, dispatchSize.groupCountX, dispatchSize.groupCountY, 1);

        // Buffer barrier to release the ownership of the output buffer
        addBufferTransitionBarrier(mCommandBuffer, mOutputBuffer,
                                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                   VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, VK_ACCESS_SHADER_WRITE_BIT,
                                   0, mQueueFamilyIndex, VK_QUEUE_FAMILY_FOREIGN_EXT);

        // Finish recording the command buffer
        ASSERT_EQ(vkEndCommandBuffer(mCommandBuffer), VK_SUCCESS);

        // Create fence
        const VkExportFenceCreateInfo exportFenceCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_EXPORT_FENCE_CREATE_INFO,
                .pNext = nullptr,
                .handleTypes = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
        };
        const VkFenceCreateInfo fenceCreateInfo = {
                .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
                .pNext = &exportFenceCreateInfo,
                .flags = 0,
        };
        ASSERT_EQ(vkCreateFence(mDevice, &fenceCreateInfo, nullptr, &mFence), VK_SUCCESS);

        mIsValid = true;
    }

    void runInternal(bool* outSuccess, base::unique_fd* outSyncFd) {
        *outSuccess = false;

        // Submit to queue
        const VkSubmitInfo submitInfo = {
                .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
                .waitSemaphoreCount = 0,
                .pWaitSemaphores = nullptr,
                .pWaitDstStageMask = nullptr,
                .commandBufferCount = 1,
                .pCommandBuffers = &mCommandBuffer,
                .signalSemaphoreCount = 0,
                .pSignalSemaphores = nullptr,
        };
        ASSERT_EQ(vkResetFences(mDevice, 1, &mFence), VK_SUCCESS);
        ASSERT_EQ(vkQueueSubmit(mQueue, 1, &submitInfo, mFence), VK_SUCCESS);

        // Export an Android sync fence FD
        int syncFd = -1;
        const VkFenceGetFdInfoKHR fenceGetFdInfo = {
                .sType = VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR,
                .pNext = nullptr,
                .fence = mFence,
                .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
        };
        ASSERT_EQ(mPfnVkGetFenceFdKHR(mDevice, &fenceGetFdInfo, &syncFd), VK_SUCCESS);
        *outSyncFd = base::unique_fd(syncFd);

        *outSuccess = true;
    }

    // Instance
    VkInstance mInstance = VK_NULL_HANDLE;

    // Physical device and queue family
    VkPhysicalDevice mPhysicalDevice = VK_NULL_HANDLE;
    VkPhysicalDeviceProperties mPhysicalDeviceProperties{};
    VkPhysicalDeviceMemoryProperties mPhysicalDeviceMemoryProperties{};
    uint32_t mQueueFamilyIndex = 0;

    // Logical device and queue
    VkDevice mDevice = VK_NULL_HANDLE;
    VkQueue mQueue = VK_NULL_HANDLE;

    // Extension functions
    PFN_vkGetFenceFdKHR mPfnVkGetFenceFdKHR = nullptr;

    // Resource descriptors
    VkDescriptorPool mDescriptorPool = VK_NULL_HANDLE;
    VkDescriptorSetLayout mDescriptorSetLayout = VK_NULL_HANDLE;
    VkDescriptorSet mDescriptorSet = VK_NULL_HANDLE;

    // Output buffer
    VkBuffer mOutputBuffer = VK_NULL_HANDLE;
    VkDeviceMemory mOutputBufferMemory = VK_NULL_HANDLE;

    // Compute pipeline
    VkShaderModule mShaderModule = VK_NULL_HANDLE;
    VkPipelineLayout mPipelineLayout = VK_NULL_HANDLE;
    VkPipeline mPipeline = VK_NULL_HANDLE;

    // Command buffer
    VkCommandPool mCommandPool = VK_NULL_HANDLE;
    VkCommandBuffer mCommandBuffer = VK_NULL_HANDLE;
    VkFence mFence = VK_NULL_HANDLE;

    bool mIsValid = false;
};

template <Type dataType>
class NnapiExecutor {
   public:
    // Returns the created object on success, or nullptr on failure.
    static std::unique_ptr<NnapiExecutor> create(const ANeuralNetworksDevice* device,
                                                 AHardwareBuffer* input, AHardwareBuffer* output) {
        auto nnapi = std::make_unique<NnapiExecutor>(input, output);
        nnapi->initialize(device);
        return nnapi->mIsValid ? std::move(nnapi) : nullptr;
    }

    // Prefer NnapiExecutor::create
    NnapiExecutor(AHardwareBuffer* input, AHardwareBuffer* output)
        : mInputMemory(input), mOutputMemory(output) {}

    // Returns {success, sync_fd}
    std::pair<bool, base::unique_fd> run(const base::unique_fd& inSyncFd) {
        bool success = false;
        base::unique_fd outSyncFd;
        runInternal(inSyncFd, &success, &outSyncFd);
        return {success, std::move(outSyncFd)};
    }

   private:
    using ElementType = typename TestTypeHelper<dataType>::ElementType;

    void initialize(const ANeuralNetworksDevice* device) {
        ASSERT_TRUE(mInputMemory.isValid());
        ASSERT_TRUE(mOutputMemory.isValid());

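        // For the quantized data types, scale = 1.0f and zeroPoint = 0 below make the stored
        // integer value equal to the represented real value, so the expected output remains
        // exactly kExpectedResultInInt without any rounding.
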
        // Model input
        const float scale = TestTypeHelper<dataType>::kIsQuantized ? 1.0f : 0.0f;
        const OperandType tensorType(dataType, {kOperandSizeY, kOperandSizeX}, scale,
                                     /*zeroPoint=*/0);
        uint32_t inputTensor = mModel.addOperand(&tensorType);

        // Constant tensor
        const OperandType constTensorType(dataType, {1}, scale, /*zeroPoint=*/0);
        const ElementType constTensorData = static_cast<ElementType>(1);
        uint32_t constTensor =
                mModel.addConstantOperand<ElementType>(&constTensorType, constTensorData);

        // Activation (NONE)
        const OperandType activationType(Type::INT32, {});
        uint32_t activationScalar = mModel.addConstantOperand<int32_t>(&activationType, 0);

        // Model output
        uint32_t outputTensor = mModel.addOperand(&tensorType);

        // Model operation
        mModel.addOperation(ANEURALNETWORKS_ADD, {inputTensor, constTensor, activationScalar},
                            {outputTensor});

        // Finish model
        mModel.identifyInputsAndOutputs({inputTensor}, {outputTensor});
        mModel.relaxComputationFloat32toFloat16(/*isRelax=*/true);
        ASSERT_TRUE(mModel.isValid());
        ASSERT_EQ(mModel.finish(), Result::NO_ERROR);

        // Create compilation for the target device
        Result result;
        std::tie(result, mCompilation) =
                test_wrapper::Compilation::createForDevice(&mModel, device);
        ASSERT_EQ(result, Result::NO_ERROR);

        // Finish the compilation
        result = mCompilation.finish();
        if (result != Result::NO_ERROR) {
            GTEST_SKIP() << "Model is not supported by the device";
        }

        mIsValid = true;
    }

    void runInternal(const base::unique_fd& inSyncFd, bool* outSuccess,
                     base::unique_fd* outSyncFd) {
        *outSuccess = false;

        // Setup execution
        mExecution = std::make_unique<test_wrapper::Execution>(&mCompilation);
        ASSERT_EQ(mExecution->setInputFromMemory(/*index=*/0, &mInputMemory, /*offset=*/0,
                                                 kOperandLength * sizeof(ElementType)),
                  Result::NO_ERROR);
        ASSERT_EQ(mExecution->setOutputFromMemory(/*index=*/0, &mOutputMemory, /*offset=*/0,
                                                  kOperandLength * sizeof(ElementType)),
                  Result::NO_ERROR);

        // Setup dependencies
        std::vector<const test_wrapper::Event*> dependencies;
        test_wrapper::Event start;
        // The sync fence from Vulkan may not be valid if the GPU workload has already finished
        // prior to exporting the fence.
        if (inSyncFd.ok()) {
            start = test_wrapper::Event(inSyncFd.get());
            ASSERT_TRUE(start.isValid());
            dependencies = {&start};
        }

        // Fenced compute
        test_wrapper::Event finished;
        mExecution->startComputeWithDependencies(dependencies, /*infinite timeout*/ 0, &finished);

        // Get the output sync fence if supported; otherwise, wait until the execution is finished
        int syncFd = -1;
        finished.getSyncFenceFd(&syncFd);
        if (syncFd == -1) {
            ASSERT_EQ(finished.wait(), Result::NO_ERROR);
        }
        *outSyncFd = base::unique_fd(syncFd);
        *outSuccess = true;
    }

    test_wrapper::Model mModel;
    test_wrapper::Compilation mCompilation;
    std::unique_ptr<test_wrapper::Execution> mExecution;
    test_wrapper::Memory mInputMemory, mOutputMemory;
    bool mIsValid = false;
};

class GpuNnapiTest : public testing::TestWithParam<NameAndDevice> {
   protected:
    void TearDown() override {
        if (mGpuOutput) {
            AHardwareBuffer_release(mGpuOutput);
        }
        if (mNnapiOutput) {
            AHardwareBuffer_release(mNnapiOutput);
        }
    }

    template <Type dataType>
    void runTest() {
#ifndef NNTEST_ONLY_PUBLIC_API
        if (DeviceManager::get()->getUseCpuOnly()) {
            GTEST_SKIP();
        }
#endif

        // Allocate hardware buffers for GPU and NNAPI outputs
        const size_t size = kOperandLength * sizeof(typename TestTypeHelper<dataType>::ElementType);
        allocateBlobAhwb(
                size, AHARDWAREBUFFER_USAGE_GPU_DATA_BUFFER | AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
                &mGpuOutput);
        allocateBlobAhwb(
                size, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN | AHARDWAREBUFFER_USAGE_CPU_WRITE_OFTEN,
                &mNnapiOutput);
        if (mGpuOutput == nullptr || mNnapiOutput == nullptr) return;

        // Create Vulkan compute pipeline
        auto vulkan = VulkanComputePipeline<dataType>::create(mGpuOutput);
        if (vulkan == nullptr) return;

        // Create NNAPI executor
        auto nnapi = NnapiExecutor<dataType>::create(kDevice, mGpuOutput, mNnapiOutput);
        if (nnapi == nullptr) return;

        // Run the test repeatedly for kNumberOfIterationsToTest iterations
        for (uint32_t i = 0; i < kNumberOfIterationsToTest; i++) {
            auto [gpuSuccess, gpuSyncFd] = vulkan->run();
            ASSERT_TRUE(gpuSuccess);

            auto [nnapiSuccess, nnapiSyncFd] = nnapi->run(gpuSyncFd);
            ASSERT_TRUE(nnapiSuccess);

            checkResults<dataType>(std::move(nnapiSyncFd));
        }
    }

    template <Type dataType>
    void checkResults(base::unique_fd syncFd) {
        using ElementType = typename TestTypeHelper<dataType>::ElementType;

        // Lock the buffer with the sync fence
        // AHardwareBuffer_lock will take ownership of the sync fence and close it even on errors
        void* data;
        ASSERT_EQ(AHardwareBuffer_lock(mNnapiOutput, AHARDWAREBUFFER_USAGE_CPU_READ_OFTEN,
                                       syncFd.release(), /*rect=*/nullptr, &data),
                  0);

        // Compare the actual results with the expected value
        uint32_t numberOfErrors = 0;
        const ElementType expected = static_cast<ElementType>(kExpectedResultInInt);
        for (uint32_t i = 0; i < kOperandLength; i++) {
            const ElementType actual = reinterpret_cast<ElementType*>(data)[i];

            // We expect bit-exact results here because the arithmetic is trivial, and all
            // intermediate and final results can be exactly represented by the primary data type.
            if (actual != expected) {
                // Print at most kMaxNumberOfPrintedErrors errors via EXPECT_EQ
                if (numberOfErrors < kMaxNumberOfPrintedErrors) {
                    EXPECT_EQ(actual, expected)
                            << "When comparing element [" << i / kOperandSizeX << ", "
                            << i % kOperandSizeX << "]";
                }
                numberOfErrors++;
            }
        }
        EXPECT_EQ(numberOfErrors, 0u);
        ASSERT_EQ(AHardwareBuffer_unlock(mNnapiOutput, /*fence=*/nullptr), 0);
    }

    // The NNAPI device under test
    const ANeuralNetworksDevice* kDevice = GetParam().second;

    AHardwareBuffer* mGpuOutput = nullptr;
    AHardwareBuffer* mNnapiOutput = nullptr;
};

TEST_P(GpuNnapiTest, Float32) {
    runTest<Type::TENSOR_FLOAT32>();
}
TEST_P(GpuNnapiTest, Float16) {
    runTest<Type::TENSOR_FLOAT16>();
}
TEST_P(GpuNnapiTest, Quant8Asymm) {
    runTest<Type::TENSOR_QUANT8_ASYMM>();
}
TEST_P(GpuNnapiTest, Quant8AsymmSigned) {
    runTest<Type::TENSOR_QUANT8_ASYMM_SIGNED>();
}

INSTANTIATE_TEST_SUITE_P(TestGpuNnapi, GpuNnapiTest, testing::ValuesIn(getNnapiDevices()),
                         printGpuNnapiTest);

}  // namespace
}  // namespace android::nn