/*
* Copyright (C) 2017 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define LOG_TAG "Manager"
#include "Manager.h"
#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <MetaModel.h>
#include <Tracing.h>
#include <nnapi/IBurst.h>
#include <nnapi/IDevice.h>
#include <nnapi/IExecution.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/SharedMemory.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>
#include <algorithm>
#include <functional>
#include <iterator>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "ExecutionCallback.h"
#include "FeatureLevel.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "TypeManager.h"
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
#include <build/version.h>
#include <cutils/native_handle.h>
#include <nnapi/hal/1.3/Buffer.h>
#include <nnapi/hal/Service.h>
#include "AppInfoFetcher.h"
#endif // NN_COMPATIBILITY_LIBRARY_BUILD
namespace android {
namespace nn {
// A Device backed by an actual underlying driver
class DriverDevice : public Device {
public:
// Create a DriverDevice from a name and a DeviceFactory function.
// Returns nullptr on failure.
static std::shared_ptr<DriverDevice> create(SharedDevice device, bool isUpdatable = false);
// Prefer using DriverDevice::create
explicit DriverDevice(SharedDevice device, bool isUpdatable);
const std::string& getName() const override { return kInterface->getName(); }
const std::string& getVersionString() const override { return kInterface->getVersionString(); }
int64_t getFeatureLevel() const override;
int32_t getType() const override { return static_cast<int32_t>(kInterface->getType()); }
bool isUpdatable() const override { return kIsUpdatable; }
const std::vector<Extension>& getSupportedExtensions() const override {
return kInterface->getSupportedExtensions();
}
std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
const Capabilities& getCapabilities() const override { return kInterface->getCapabilities(); }
Capabilities::PerformanceInfo getPerformance(OperandType type) const override {
return getCapabilities().operandPerformance.lookup(type);
}
Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
return getCapabilities().relaxedFloat32toFloat16PerformanceScalar;
}
Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
return getCapabilities().relaxedFloat32toFloat16PerformanceTensor;
}
Capabilities::PerformanceInfo getIfPerformance() const override {
return getCapabilities().ifPerformance;
}
Capabilities::PerformanceInfo getWhilePerformance() const override {
return getCapabilities().whilePerformance;
}
std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
return kInterface->getNumberOfCacheFilesNeeded();
}
bool isCachingSupported() const override {
// Caching is supported if either numModelCache or numDataCache is greater than 0.
const auto [numModelCacheFiles, numDataCacheFiles] = getNumberOfCacheFilesNeeded();
return numModelCacheFiles > 0 || numDataCacheFiles > 0;
}
int wait() const override {
auto result = kInterface->wait();
if (!result.ok()) {
LOG(ERROR) << "DriverDevice::wait error: " << result.error().message;
return convertErrorStatusToResultCode(result.error().code);
}
return ANEURALNETWORKS_NO_ERROR;
}
std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
const std::optional<CacheToken>& maybeToken) const override;
std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
OperandType) const override;
private:
const SharedDevice kInterface;
const bool kIsUpdatable;
GeneralResult<std::vector<bool>> getSupportedOperationsImpl(const MetaModel& metaModel) const;
GeneralResult<SharedPreparedModel> prepareModelFromCacheInternal(
const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
const CacheToken& token) const;
#ifdef NN_DEBUGGABLE
// For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
// 0 - all operations reported by IDevice::getSupportedOperations() supported
// 1 - some operations reported by IDevice::getSupportedOperations() supported
uint32_t mSupported = 0;
#endif // NN_DEBUGGABLE
};
// A RuntimePreparedModel backed by an IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public RuntimePreparedModel {
public:
DriverPreparedModel(const Device* device, const SharedPreparedModel& preparedModel)
: mDevice(device), mPreparedModel(preparedModel) {
CHECK(mDevice != nullptr);
CHECK(mPreparedModel != nullptr);
}
const Device* getDevice() const override { return mDevice; }
SharedPreparedModel getInterface() const override { return mPreparedModel; }
std::tuple<int, std::vector<OutputShape>, Timing> execute(
const std::vector<ModelArgumentInfo>& inputs,
const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
MeasureTiming measure, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration) const override;
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
const std::vector<ModelArgumentInfo>& inputs,
const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
MeasureTiming measure, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration,
const OptionalDuration& timeoutDurationAfterFence) const override;
std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
const std::vector<ModelArgumentInfo>& inputs,
const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
const OptionalDuration& loopTimeoutDuration) const override;
GeneralResult<SharedBurst> configureExecutionBurst() const override {
return mPreparedModel->configureExecutionBurst();
}
MemoryPreference getMemoryPreference() const override {
if (mDevice->getFeatureLevel() >= ANEURALNETWORKS_FEATURE_LEVEL_5) {
return {kDefaultRequestMemoryAlignment, kDefaultRequestMemoryPadding};
} else {
// We are not able to pass memory padding information to HIDL drivers, so return the
// minimum padding.
return {kDefaultRequestMemoryAlignment, kMinMemoryPadding};
}
}
private:
const Device* mDevice;
const SharedPreparedModel mPreparedModel;
};
class DriverExecution : public RuntimeExecution {
public:
DriverExecution(SharedExecution execution, Request request,
std::vector<const RuntimeMemory*> memories, MeasureTiming measure,
OptionalDuration loopTimeoutDuration, int64_t deviceFeatureLevel)
: kExecution(std::move(execution)),
kRequest(std::move(request)),
kMemories(std::move(memories)),
kMeasure(measure),
kLoopTimeoutDuration(std::move(loopTimeoutDuration)),
kDeviceFeatureLevel(deviceFeatureLevel) {
CHECK(kExecution != nullptr);
}
std::tuple<int, std::vector<OutputShape>, Timing> compute(
const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
const OptionalDuration& timeoutDurationAfterFence) const override;
private:
const SharedExecution kExecution;
// For burst execution.
const Request kRequest;
const std::vector<const RuntimeMemory*> kMemories;
const MeasureTiming kMeasure;
const OptionalDuration kLoopTimeoutDuration;
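// Reusable executions created through each burst controller, keyed by the IBurst, so that
// repeated compute() calls with the same burst reuse the same execution.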
mutable std::map<const IBurst*, SharedExecution> mCachedBurstExecutions;
// For fenced execution.
const int64_t kDeviceFeatureLevel;
};
DriverDevice::DriverDevice(SharedDevice device, bool isUpdatable)
: kInterface(std::move(device)), kIsUpdatable(isUpdatable) {
CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
static const char samplePrefix[] = "sample";
if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
mSupported = getProp("debug.nn.sample.supported");
}
#endif // NN_DEBUGGABLE
}
std::shared_ptr<DriverDevice> DriverDevice::create(SharedDevice device, bool isUpdatable) {
if (device == nullptr) {
LOG(ERROR) << "DriverDevice::create called with nullptr";
return nullptr;
}
return std::make_shared<DriverDevice>(std::move(device), isUpdatable);
}
int64_t DriverDevice::getFeatureLevel() const {
Version featureLevel = kInterface->getFeatureLevel();
switch (featureLevel) {
case Version::ANDROID_OC_MR1:
return ANEURALNETWORKS_FEATURE_LEVEL_1;
case Version::ANDROID_P:
return ANEURALNETWORKS_FEATURE_LEVEL_2;
case Version::ANDROID_Q:
return ANEURALNETWORKS_FEATURE_LEVEL_3;
case Version::ANDROID_R:
return ANEURALNETWORKS_FEATURE_LEVEL_4;
case Version::ANDROID_S:
return ANEURALNETWORKS_FEATURE_LEVEL_5;
case Version::CURRENT_RUNTIME:
break;
}
LOG(FATAL) << "Unsupported driver feature level: " << featureLevel;
return -1;
}
GeneralResult<std::vector<bool>> DriverDevice::getSupportedOperationsImpl(
const MetaModel& metaModel) const {
const auto featureLevel = kInterface->getFeatureLevel();
const auto slice = metaModel.getSlice(featureLevel);
if (!slice.has_value()) {
return NN_ERROR() << "getSlice(" << featureLevel << ") failed";
}
const auto& [sliceModel, slicedModelOperationIndexToModelOperationIndex] = *slice;
const std::vector<bool> supported = NN_TRY(kInterface->getSupportedOperations(sliceModel));
const uint32_t slicedOperationCount = sliceModel.main.operations.size();
if (supported.size() != slicedOperationCount) {
return NN_ERROR() << "IDevice::getSupportedOperations returned a vector of length "
<< supported.size() << " when expecting " << slicedOperationCount;
}
const Model& model = metaModel.getModel();
const uint32_t operationCount = model.main.operations.size();
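// Remap the supported flags from sliced-model operation indices back to the operation indices
// of the original model.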
std::vector<bool> remappedSupported(operationCount, false);
for (size_t i = 0; i < supported.size(); ++i) {
if (supported[i]) {
remappedSupported[slicedModelOperationIndexToModelOperationIndex(i)] = true;
}
}
return remappedSupported;
}
std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
const Model& model = metaModel.getModel();
auto result = getSupportedOperationsImpl(metaModel);
if (!result.ok()) {
LOG(ERROR) << "getSupportedOperations failed with code " << result.error().code << ": "
<< result.error().message;
// Set the supported operation vectors to all false, so we won't use this driver.
return std::vector<bool>(model.main.operations.size(), false);
}
std::vector<bool>& supportedOperations = result.value();
#ifdef NN_DEBUGGABLE
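// When the debug.nn.sample.supported property is 1 on a sample driver, pseudo-randomly flip
// some supported operations to unsupported, based on a hash of the device name and each
// operation's type, operands, and dimensions.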
if (mSupported != 1) {
return supportedOperations;
}
const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
for (size_t operationIndex = 0; operationIndex < supportedOperations.size(); operationIndex++) {
if (!supportedOperations[operationIndex]) {
continue;
}
uint32_t accumulator = baseAccumulator;
const Operation& operation = model.main.operations[operationIndex];
accumulator ^= static_cast<uint32_t>(operation.type);
auto accumulateOperands = [&model, &accumulator](const std::vector<uint32_t>& operands) {
for (uint32_t operandIndex : operands) {
const Operand& operand = model.main.operands[operandIndex];
accumulator ^= static_cast<uint32_t>(operand.type);
accumulator ^= operand.dimensions.size();
for (const Dimension& dimension : operand.dimensions) {
accumulator ^= dimension;
if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY ||
operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE ||
operand.lifetime == Operand::LifeTime::POINTER) {
accumulator ^= 1;
}
}
}
};
accumulateOperands(operation.inputs);
accumulateOperands(operation.outputs);
if (accumulator & 1) {
supportedOperations[operationIndex] = false;
}
}
#endif // NN_DEBUGGABLE
return supportedOperations;
}
// Opens a cache file for reading and writing and returns a shared handle.
static GeneralResult<SharedHandle> createCacheHandle(const std::string& filename,
bool createIfNotExist) {
auto fd = base::unique_fd(open(filename.c_str(), createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR,
S_IRUSR | S_IWUSR));
if (fd.get() == -1) {
return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
<< "Failed to " << (createIfNotExist ? "open or create" : "open") << " cache file "
<< filename;
}
std::vector<base::unique_fd> fds;
fds.push_back(std::move(fd));
return std::make_shared<const Handle>(Handle{
.fds = std::move(fds),
.ints = {},
});
}
// Opens a list of cache files and returns a vector of shared handles. The files
// are always opened with both read and write permissions.
static GeneralResult<std::vector<SharedHandle>> createCacheHandleVec(
uint32_t numCacheFiles, const std::string& baseFilename, bool createIfNotExist) {
CHECK(numCacheFiles <= kMaxNumberOfCacheFiles);
std::vector<SharedHandle> handles;
handles.reserve(numCacheFiles);
for (uint32_t i = 0; i < numCacheFiles; i++) {
std::string filename = baseFilename + std::to_string(i);
VLOG(COMPILATION) << "Cache " << i << ": " << filename;
handles.push_back(NN_TRY(createCacheHandle(filename, createIfNotExist)));
}
return handles;
}
// Returns the model cache and data cache handles. If the cache info already contains handles,
// validates their counts and returns them; otherwise maps the token to cache file names under
// the cache directory and returns shared handles to the opened files.
static GeneralResult<CacheHandles> getCacheHandles(
const CacheInfo& cacheInfo, const CacheToken& token,
const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist) {
if (const auto* cacheHandles = std::get_if<CacheHandles>(&cacheInfo.variant)) {
if (cacheHandles->modelCache.size() != numCacheFiles.first) {
return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
<< "Expected " << numCacheFiles.first << " model cache handles, got "
<< cacheHandles->modelCache.size();
}
if (cacheHandles->dataCache.size() != numCacheFiles.second) {
return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
<< "Expected " << numCacheFiles.second << " data cache handles, got "
<< cacheHandles->dataCache.size();
}
return *cacheHandles;
}
// The filename consists of kByteSizeOfCacheToken * 2 characters for the token,
// plus 1 character identifying whether the file is model cache or data cache.
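// Each token byte is encoded as two characters in the range 'A'..'P': low nibble first, then
// high nibble (e.g., a token byte of 0x3A becomes "KD").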
std::string filename(kByteSizeOfCacheToken * 2 + 1, '0');
for (uint32_t i = 0; i < kByteSizeOfCacheToken; i++) {
filename[i * 2] = 'A' + (token[i] & 0x0F);
filename[i * 2 + 1] = 'A' + (token[i] >> 4);
}
const auto& cacheDir = std::get<CacheDir>(cacheInfo.variant);
CHECK(cacheDir.empty() || cacheDir.back() == '/');
std::string cacheFileName = cacheDir + filename;
const uint32_t cacheTypeIdentifierIndex = cacheDir.size() + kByteSizeOfCacheToken * 2;
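// '1' identifies model cache files and '2' identifies data cache files.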
cacheFileName[cacheTypeIdentifierIndex] = '1';
std::vector<SharedHandle> modelCache =
NN_TRY(createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist));
cacheFileName[cacheTypeIdentifierIndex] = '2';
std::vector<SharedHandle> dataCache =
NN_TRY(createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist));
return CacheHandles{
.modelCache = std::move(modelCache),
.dataCache = std::move(dataCache),
};
}
GeneralResult<SharedPreparedModel> DriverDevice::prepareModelFromCacheInternal(
const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
const CacheToken& token) const {
// Get cache files if they exist, otherwise return from the function early.
auto cache = NN_TRY(getCacheHandles(cacheInfo, token, kInterface->getNumberOfCacheFilesNeeded(),
/*createIfNotExist=*/false));
return kInterface->prepareModelFromCache(deadline, cache.modelCache, cache.dataCache, token);
}
std::pair<int, std::shared_ptr<RuntimePreparedModel>> DriverDevice::prepareModel(
const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
const std::optional<CacheToken>& maybeToken) const {
// Attempt to compile from cache if token is present.
if (maybeToken.has_value()) {
auto result = prepareModelFromCacheInternal(deadline, cacheInfo, *maybeToken);
if (result.has_value()) {
return {ANEURALNETWORKS_NO_ERROR,
std::make_shared<DriverPreparedModel>(this, std::move(result).value())};
} else {
LOG(ERROR) << "prepareModelFromCache failure (" << result.error().code
<< "): " << result.error().message;
}
}
// Get cache files if they exist, otherwise create them.
CacheHandles cache;
if (maybeToken.has_value()) {
auto result =
getCacheHandles(cacheInfo, *maybeToken, kInterface->getNumberOfCacheFilesNeeded(),
/*createIfNotExist=*/true);
if (result.has_value()) {
cache = std::move(result).value();
} else {
LOG(ERROR) << "getCacheHandles failure (" << result.error().code
<< "): " << result.error().message;
}
}
// Get the token if it exists, otherwise get a null token.
static constexpr CacheToken kNullToken = {};
const CacheToken token = maybeToken.value_or(kNullToken);
// Fall back to full compilation (possibly with a token) if
// prepareModelFromCache could not be used or failed.
const Model model = makeModel();
auto result = kInterface->prepareModel(model, preference, priority, deadline, cache.modelCache,
cache.dataCache, token);
if (!result.ok()) {
LOG(ERROR) << "IDevice::prepareModel() error: " << result.error().message;
return {convertErrorStatusToResultCode(result.error().code), nullptr};
}
SharedPreparedModel preparedModel = std::move(result).value();
CHECK(preparedModel != nullptr)
<< "IDevice::prepareModel() returned nullptr without error code";
return {ANEURALNETWORKS_NO_ERROR,
std::make_shared<DriverPreparedModel>(this, std::move(preparedModel))};
}
std::pair<int, std::unique_ptr<RuntimeMemory>> DriverDevice::allocate(const MemoryDescriptor& desc,
OperandType) const {
const BufferDesc bufferDesc = {.dimensions = desc.dimensions};
std::vector<SharedPreparedModel> preparedModels(desc.preparedModels.size());
std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
[](const auto* preparedModel) {
const auto versionedPreparedModel = preparedModel->getInterface();
CHECK(versionedPreparedModel != nullptr);
return versionedPreparedModel;
});
auto result =
kInterface->allocate(bufferDesc, preparedModels, desc.inputRoles, desc.outputRoles);
if (!result.ok()) {
LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
<< " failed!";
return {convertErrorStatusToResultCode(result.error().code), nullptr};
}
return MemoryFromDevice::create(std::move(result).value());
}
static Request createDriverRequest(const std::vector<ModelArgumentInfo>& inputs,
const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories) {
Request request;
request.inputs.reserve(inputs.size());
std::transform(inputs.begin(), inputs.end(), std::back_inserter(request.inputs),
[](const auto& input) { return input.createRequestArgument(); });
request.outputs.reserve(outputs.size());
std::transform(outputs.begin(), outputs.end(), std::back_inserter(request.outputs),
[](const auto& output) { return output.createRequestArgument(); });
request.pools.reserve(memories.size());
std::transform(memories.begin(), memories.end(), std::back_inserter(request.pools),
[](const RuntimeMemory* memory) { return memory->getMemoryPool(); });
return request;
}
// Perform computation on an actual device driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs and
// outputs specified by pointers. The input pointer data will be copied to the input pool prior to
// execution, and the output pointer data will be copied out from the output pool after the
// execution.
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
MeasureTiming measure, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration) const {
NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");
auto request = createDriverRequest(inputs, outputs, memories);
NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::execute::execute");
ExecutionResult<std::pair<std::vector<OutputShape>, Timing>> result;
// compute using burst if present, otherwise compute from IPreparedModel
const bool burstCompute = (burstController != nullptr);
if (burstCompute) {
for (const RuntimeMemory* memory : memories) {
const auto pool = memory->getMemoryPool();
if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
auto cacheHold = burstController->cacheMemory(*maybeMemory);
memory->hold(cacheHold);
}
}
VLOG(EXECUTION) << "Before burstController->execute() " << SHOW_IF_DEBUG(request);
result = burstController->execute(request, measure, deadline, loopTimeoutDuration);
} else {
result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
}
int n = ANEURALNETWORKS_OP_FAILED;
std::vector<OutputShape> outputShapes;
Timing timing;
if (result.ok()) {
n = ANEURALNETWORKS_NO_ERROR;
std::tie(outputShapes, timing) = std::move(result).value();
} else {
auto [message, code, returnedOutputShapes] = std::move(result).error();
n = convertErrorStatusToResultCode(code);
VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
<< "::execute(...) error: " << message;
if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
outputShapes = std::move(returnedOutputShapes);
}
return {n, std::move(outputShapes), timing};
}
VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverPreparedModel::executeFenced(
const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
MeasureTiming measure, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration,
const OptionalDuration& timeoutDurationAfterFence) const {
NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));
auto request = createDriverRequest(inputs, outputs, memories);
NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::executeFenced");
std::vector<SyncFence> waitForHandles;
waitForHandles.reserve(waitFor.size());
for (int fd : waitFor) {
int dupFd = dup(fd);
if (dupFd < 0) {
LOG(ERROR) << "Unable to dup the file descriptor";
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
}
waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
}
SyncFence syncFence = SyncFence::createAsSignaled();
ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
Timing timing = {};
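// Drivers at feature level 4 (HAL 1.3) and above support IPreparedModel::executeFenced;
// older drivers fall back to synchronous execution below.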
if (mDevice->getFeatureLevel() >= kHalVersionV1_3ToApi.featureLevel) {
auto result = mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
loopTimeoutDuration, timeoutDurationAfterFence);
if (!result.ok()) {
LOG(ERROR) << "IPreparedModel::executeFenced() error: " << result.error().message;
VLOG(EXECUTION) << "**executeFenced failed**";
return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
}
std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
} else {
// Fall back to synchronous execution if executeFenced is not supported.
// First wait for all sync fences to be ready.
LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
for (const auto& fence : waitForHandles) {
if (!fence.hasFd() || fence.getFd() < 0) {
return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
}
auto r = fence.syncWait({/* no timeout */});
if (r != SyncFence::FenceState::SIGNALED) {
LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
}
}
auto result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
if (!result.ok()) {
LOG(ERROR) << "IPreparedModel::execute() error: " << result.error().message;
return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
}
std::tie(std::ignore, timing) = result.value();
}
int syncFenceFd = -1;
if (syncFence.hasFd()) {
syncFenceFd = dup(syncFence.getFd());
if (syncFenceFd < 0) {
LOG(ERROR) << "Failed to dup the file descriptor";
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
}
}
VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}
std::pair<int, std::shared_ptr<RuntimeExecution>> DriverPreparedModel::createReusableExecution(
const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
const OptionalDuration& loopTimeoutDuration) const {
NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::createReusableExecution");
auto request = createDriverRequest(inputs, outputs, memories);
auto result = mPreparedModel->createReusableExecution(request, measure, loopTimeoutDuration);
if (!result.ok()) {
LOG(ERROR) << "IPreparedModel::createReusableExecution() error: " << result.error().message;
const int n = convertErrorStatusToResultCode(result.error().code);
return {n, nullptr};
}
auto execution = std::make_shared<DriverExecution>(
std::move(result).value(), std::move(request), memories, measure, loopTimeoutDuration,
mDevice->getFeatureLevel());
return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}
std::tuple<int, std::vector<OutputShape>, Timing> DriverExecution::compute(
const SharedBurst& burstController, const OptionalTimePoint& deadline) const {
NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::compute");
// compute using burst if present, otherwise compute from IPreparedModel
SharedExecution execution;
const bool burstCompute = (burstController != nullptr);
if (burstCompute) {
// Create a reusable burst execution if this controller has not been seen before.
auto burstExecution = mCachedBurstExecutions.find(burstController.get());
if (burstExecution == mCachedBurstExecutions.end()) {
for (const RuntimeMemory* memory : kMemories) {
const auto pool = memory->getMemoryPool();
if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
auto cacheHold = burstController->cacheMemory(*maybeMemory);
memory->hold(cacheHold);
}
}
auto createResult = burstController->createReusableExecution(kRequest, kMeasure,
kLoopTimeoutDuration);
if (!createResult.ok()) {
LOG(ERROR) << "IBurst::createReusableExecution() error: "
<< createResult.error().message;
const int n = convertErrorStatusToResultCode(createResult.error().code);
return {n, {}, {}};
}
execution = std::move(createResult).value();
mCachedBurstExecutions.emplace(burstController.get(), execution);
} else {
execution = burstExecution->second;
}
VLOG(EXECUTION) << "Before mBurstExecution->compute() " << SHOW_IF_DEBUG(kRequest);
} else {
execution = kExecution;
}
CHECK(execution != nullptr);
auto result = execution->compute(deadline);
if (!result.ok()) {
auto [message, code, returnedOutputShapes] = std::move(result).error();
int n = convertErrorStatusToResultCode(code);
VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
<< "::execute(...) error: " << message;
if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
return {n, std::move(returnedOutputShapes), {}};
}
return {n, {}, {}};
}
VLOG(EXECUTION) << "DriverExecution::compute completed";
auto [outputShapes, timing] = std::move(result).value();
return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverExecution::computeFenced(
const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
const OptionalDuration& timeoutDurationAfterFence) const {
NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::computeFenced");
CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));
std::vector<SyncFence> waitForHandles;
waitForHandles.reserve(waitFor.size());
for (int fd : waitFor) {
int dupFd = dup(fd);
if (dupFd < 0) {
LOG(ERROR) << "Unable to dup the file descriptor";
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
}
waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
}
SyncFence syncFence = SyncFence::createAsSignaled();
ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
Timing timing = {};
if (kDeviceFeatureLevel >= kHalVersionV1_3ToApi.featureLevel) {
auto result =
kExecution->computeFenced(waitForHandles, deadline, timeoutDurationAfterFence);
if (!result.ok()) {
LOG(ERROR) << "IExecution::computeFenced() error: " << result.error().message;
VLOG(EXECUTION) << "**computeFenced failed**";
return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
}
std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
} else {
// Fall back to synchronous execution if computeFenced is not supported.
// First wait for all sync fences to be ready.
LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
for (const auto& fence : waitForHandles) {
if (!fence.hasFd() || fence.getFd() < 0) {
return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
}
auto r = fence.syncWait({/* no timeout */});
if (r != SyncFence::FenceState::SIGNALED) {
LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
}
}
auto result = kExecution->compute(deadline);
if (!result.ok()) {
LOG(ERROR) << "IExecution::compute() error: " << result.error().message;
return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
}
std::tie(std::ignore, timing) = result.value();
}
int syncFenceFd = -1;
if (syncFence.hasFd()) {
syncFenceFd = dup(syncFence.getFd());
if (syncFenceFd < 0) {
LOG(ERROR) << "Failed to dup the file descriptor";
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
}
}
VLOG(EXECUTION) << "DriverExecution::computeFenced completed";
return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}
static Capabilities createCpuCapabilities() {
constexpr Capabilities::PerformanceInfo kPerf = {.execTime = 1.0f, .powerUsage = 1.0f};
constexpr OperandType operandTypes[] = {
OperandType::FLOAT32,
OperandType::INT32,
OperandType::UINT32,
OperandType::TENSOR_FLOAT32,
OperandType::TENSOR_INT32,
OperandType::TENSOR_QUANT8_ASYMM,
OperandType::BOOL,
OperandType::TENSOR_QUANT16_SYMM,
OperandType::TENSOR_FLOAT16,
OperandType::TENSOR_BOOL8,
OperandType::FLOAT16,
OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL,
OperandType::TENSOR_QUANT16_ASYMM,
OperandType::TENSOR_QUANT8_SYMM,
OperandType::TENSOR_QUANT8_ASYMM_SIGNED,
};
std::vector<Capabilities::OperandPerformance> operandPerformance;
operandPerformance.reserve(std::size(operandTypes));
std::transform(std::begin(operandTypes), std::end(operandTypes),
std::back_inserter(operandPerformance), [kPerf](OperandType type) {
return Capabilities::OperandPerformance{.type = type, .info = kPerf};
});
auto table =
Capabilities::OperandPerformanceTable::create(std::move(operandPerformance)).value();
return Capabilities{
.relaxedFloat32toFloat16PerformanceScalar = kPerf,
.relaxedFloat32toFloat16PerformanceTensor = kPerf,
.operandPerformance = std::move(table),
.ifPerformance = kPerf,
.whilePerformance = kPerf,
};
}
// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
public:
// Returns the singleton CPU fallback device.
static std::shared_ptr<CpuDevice> get() {
static std::shared_ptr<CpuDevice> instance(new CpuDevice);
return instance;
}
const std::string& getName() const override { return kName; }
const std::string& getVersionString() const override { return kVersionString; }
int64_t getFeatureLevel() const override { return kFeatureLevel; }
int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
bool isUpdatable() const override { return false; }
const std::vector<Extension>& getSupportedExtensions() const override {
return kSupportedExtensions;
}
std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
const Capabilities& getCapabilities() const override { return kCapabilities; }
Capabilities::PerformanceInfo getPerformance(OperandType) const override {
return kPerformance;
}
Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
return kPerformance;
}
Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
return kPerformance;
}
Capabilities::PerformanceInfo getIfPerformance() const override { return kPerformance; }
Capabilities::PerformanceInfo getWhilePerformance() const override { return kPerformance; }
std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
return {/*numModelCache=*/0, /*numDataCache=*/0};
}
bool isCachingSupported() const override { return false; }
int wait() const override { return ANEURALNETWORKS_NO_ERROR; }
std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
const std::optional<CacheToken>& maybeToken) const override;
std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
OperandType type) const override;
private:
CpuDevice() = default;
const int64_t kFeatureLevel = kCurrentNNAPIRuntimeFeatureLevel;
const std::string kName = "nnapi-reference";
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
const std::string kVersionString = build::GetBuildNumber();
#else
const std::string kVersionString = "UNKNOWN";
#endif // NN_COMPATIBILITY_LIBRARY_BUILD
// Since the performance is a ratio compared to the CPU performance,
// by definition the performance of the CPU is 1.0.
const Capabilities::PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
const Capabilities kCapabilities = createCpuCapabilities();
const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};
// A special abstracted RuntimePreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public RuntimePreparedModel {
public:
// Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
// a prepared model object if successfully created. Returns an error code
// and nullptr otherwise.
static std::pair<int, std::shared_ptr<RuntimePreparedModel>> create(Model model);
const Device* getDevice() const override { return CpuDevice::get().get(); }
SharedPreparedModel getInterface() const override { return nullptr; }
std::tuple<int, std::vector<OutputShape>, Timing> execute(
const std::vector<ModelArgumentInfo>& inputs,
const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
MeasureTiming measure, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration) const override;
GeneralResult<SharedBurst> configureExecutionBurst() const override { return nullptr; }
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
const std::vector<ModelArgumentInfo>& inputs,
const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
MeasureTiming measure, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration,
const OptionalDuration& timeoutDurationAfterFence) const override;
std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
const std::vector<ModelArgumentInfo>& inputs,
const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
const OptionalDuration& loopTimeoutDuration) const override;
MemoryPreference getMemoryPreference() const override {
return {kPreferredAlignment, kPreferredPadding};
}
// Prefer to use CpuPreparedModel::create.
CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
: mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}
const Model& getModel() const { return mModel; }
const std::vector<RunTimePoolInfo>& getModelPoolInfos() const { return mModelPoolInfos; }
private:
// TFLite kernels prefer 64 bytes for padding and alignment.
static constexpr uint32_t kPreferredAlignment = 64;
static constexpr uint32_t kPreferredPadding = 64;
const Model mModel;
const std::vector<RunTimePoolInfo> mModelPoolInfos;
};
class CpuExecution : public RuntimeExecution {
public:
CpuExecution(const CpuPreparedModel& preparedModel, Request request,
std::vector<RunTimePoolInfo> requestPoolInfos,
OptionalDuration loopTimeoutDuration)
: kPreparedModel(preparedModel),
kRequest(std::move(request)),
kRequestPoolInfos(std::move(requestPoolInfos)),
kLoopTimeoutDuration(std::move(loopTimeoutDuration)) {}
std::tuple<int, std::vector<OutputShape>, Timing> compute(
const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
const OptionalDuration& timeoutDurationAfterFence) const override;
private:
const CpuPreparedModel& kPreparedModel;
Request kRequest;
std::vector<RunTimePoolInfo> kRequestPoolInfos;
const OptionalDuration kLoopTimeoutDuration;
};
std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
const Model& model = metaModel.getModel();
const size_t count = model.main.operations.size();
std::vector<bool> result(count, false);
for (size_t i = 0; i < count; i++) {
// TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
// We may want to use the slicer for CpuDevice just as we do for
// DriverDevice.
OperationType operationType = model.main.operations[i].type;
result[i] = !isExtension(operationType) && operationType != OperationType::OEM_OPERATION;
}
return result;
}
std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuDevice::prepareModel(
const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
const OptionalTimePoint& deadline, const CacheInfo& /*cacheInfo*/,
const std::optional<CacheToken>& maybeToken) const {
CHECK(!maybeToken.has_value())
<< "Should never call prepareModel with cache information on CpuDevice";
const Model model = makeModel();
if (auto result = validate(model); !result.ok()) {
LOG(ERROR) << "Invalid Model: " << result.error();
return {ANEURALNETWORKS_OP_FAILED, nullptr};
}
if (auto result = validate(preference); !result.ok()) {
LOG(ERROR) << "Invalid ExecutionPreference: " << result.error();
return {ANEURALNETWORKS_OP_FAILED, nullptr};
}
if (auto result = validate(priority); !result.ok()) {
LOG(ERROR) << "Invalid Priority: " << result.error();
return {ANEURALNETWORKS_OP_FAILED, nullptr};
}
if (hasDeadlinePassed(deadline)) {
return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
}
return CpuPreparedModel::create(model);
}
std::pair<int, std::unique_ptr<RuntimeMemory>> CpuDevice::allocate(const MemoryDescriptor& desc,
OperandType type) const {
uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
if (size == 0) {
LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
return {ANEURALNETWORKS_OP_FAILED, nullptr};
}
return MemoryAshmem::create(size);
}
std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuPreparedModel::create(Model model) {
std::vector<RunTimePoolInfo> poolInfos;
if (!setRunTimePoolInfosFromCanonicalMemories(&poolInfos, model.pools)) {
return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
}
std::shared_ptr<RuntimePreparedModel> preparedModel =
std::make_shared<CpuPreparedModel>(std::move(model), std::move(poolInfos));
return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}
static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
const Model& model, const Request& request,
const std::vector<RunTimePoolInfo>& modelPoolInfos,
const std::vector<RunTimePoolInfo>& requestPoolInfos, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration) {
NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
CpuExecutor executor;
if (loopTimeoutDuration.has_value()) {
executor.setLoopTimeout(loopTimeoutDuration->count());
}
if (deadline.has_value()) {
executor.setDeadline(*deadline);
}
int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
const auto& outputShapes = executor.getOutputShapes();
return {err, outputShapes, {}};
}
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuPreparedModel::executeFenced(
const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
MeasureTiming measure, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration, const OptionalDuration& duration) const {
VLOG(EXECUTION)
<< "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
for (int syncFd : waitFor) {
if (syncFd > 0) {
auto r = syncWait(syncFd, -1);
if (r != FenceState::SIGNALED) {
LOG(ERROR) << "sync wait failed, fd: " << syncFd;
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
}
}
}
// Update deadline if the timeout duration is closer than the deadline.
auto closestDeadline = deadline;
if (duration.has_value()) {
const auto timeoutDurationDeadline = makeDeadline(*duration);
if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
closestDeadline = timeoutDurationDeadline;
}
}
const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
closestDeadline, loopTimeoutDuration);
return {result, -1, nullptr, timing};
}
static std::tuple<int, Request, std::vector<RunTimePoolInfo>> createCpuRequest(
const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories) {
std::vector<RunTimePoolInfo> requestPoolInfos;
requestPoolInfos.reserve(memories.size());
for (const RuntimeMemory* mem : memories) {
if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
requestPoolInfos.emplace_back(*poolInfo);
} else {
return {ANEURALNETWORKS_UNMAPPABLE, {}, {}};
}
}
// Create one additional pool for each input / output specified by a pointer.
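// Each pointer's buffer is wrapped directly as a RunTimePoolInfo, so no data is copied.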
auto fixPointerArguments =
[&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
std::vector<DataLocation> ptrArgsLocations;
for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
ptrArgsLocations.push_back(
{.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
.offset = 0,
.length = argumentInfo.length(),
.padding = argumentInfo.padding()});
requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
static_cast<uint8_t*>(argumentInfo.buffer())));
}
}
return ptrArgsLocations;
};
const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);
Request request;
request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
return {ANEURALNETWORKS_NO_ERROR, std::move(request), std::move(requestPoolInfos)};
}
// Perform computation on NNAPI CPU reference implementation.
//
// Contrary to DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
// same process as the NNAPI runtime and can take raw pointers. We will create as many pools as
// there are input/output in this method to avoid data copying.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, const SharedBurst& /*burstController*/,
MeasureTiming /*measure*/, const OptionalTimePoint& deadline,
const OptionalDuration& loopTimeoutDuration) const {
if (hasDeadlinePassed(deadline)) {
return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
}
int nCreateRequest;
Request request;
std::vector<RunTimePoolInfo> requestPoolInfos;
std::tie(nCreateRequest, request, requestPoolInfos) =
createCpuRequest(inputs, outputs, memories);
if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
return {nCreateRequest, {}, {}};
}
if (!DeviceManager::get()->syncExecCpu()) {
// TODO: use a thread pool
// TODO(mikie): this could have NNTRACE so we could measure the overhead
// of spinning up a new thread.
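// Note: the spawned thread is joined immediately, so the call is still synchronous for the
// caller; the computation simply runs on a separate thread.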
std::tuple<int, std::vector<OutputShape>, Timing> result = {};
std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
loopTimeoutDuration);
}).join();
return result;
}
return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
loopTimeoutDuration);
}
std::pair<int, std::shared_ptr<RuntimeExecution>> CpuPreparedModel::createReusableExecution(
const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
const std::vector<const RuntimeMemory*>& memories, MeasureTiming /*measure*/,
const OptionalDuration& loopTimeoutDuration) const {
auto [nCreateRequest, request, requestPoolInfos] = createCpuRequest(inputs, outputs, memories);
if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
return {nCreateRequest, nullptr};
}
auto execution = std::make_shared<CpuExecution>(
*this, std::move(request), std::move(requestPoolInfos), loopTimeoutDuration);
return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}
std::tuple<int, std::vector<OutputShape>, Timing> CpuExecution::compute(
const SharedBurst& /*burstController*/, const OptionalTimePoint& deadline) const {
if (hasDeadlinePassed(deadline)) {
return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
}
if (!DeviceManager::get()->syncExecCpu()) {
// TODO: use a thread pool
// TODO(mikie): this could have NNTRACE so we could measure the overhead
// of spinning up a new thread.
std::tuple<int, std::vector<OutputShape>, Timing> result = {};
std::thread([this, &deadline, &result] {
result = computeOnCpu(kPreparedModel.getModel(), kRequest,
kPreparedModel.getModelPoolInfos(), kRequestPoolInfos, deadline,
kLoopTimeoutDuration);
}).join();
return result;
}
return computeOnCpu(kPreparedModel.getModel(), kRequest, kPreparedModel.getModelPoolInfos(),
kRequestPoolInfos, deadline, kLoopTimeoutDuration);
}
std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuExecution::computeFenced(
const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
const OptionalDuration& duration) const {
VLOG(EXECUTION)
<< "CpuExecution::computeFenced wait for sync fences to signal before execution";
for (int syncFd : waitFor) {
if (syncFd > 0) {
auto r = syncWait(syncFd, -1);
if (r != FenceState::SIGNALED) {
LOG(ERROR) << "sync wait failed, fd: " << syncFd;
return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
}
}
}
// Update deadline if the timeout duration is closer than the deadline.
auto closestDeadline = deadline;
if (duration.has_value()) {
const auto timeoutDurationDeadline = makeDeadline(*duration);
if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
closestDeadline = timeoutDurationDeadline;
}
}
const auto [result, outputShapes, timing] = compute(nullptr, closestDeadline);
return {result, -1, nullptr, timing};
}
DeviceManager* DeviceManager::get() {
static DeviceManager manager;
return &manager;
}
std::shared_ptr<Device> DeviceManager::getCpuDevice() {
return CpuDevice::get();
}
std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const SharedDevice& device) {
VLOG(MANAGER) << "forTest_makeDriverDevice(" << device->getName() << ")";
const auto driverDevice = DriverDevice::create(device);
CHECK(driverDevice != nullptr);
return driverDevice;
}
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
const auto& appInfo = AppInfoFetcher::get()->getAppInfo();
const bool currentProcessIsOnThePlatform =
appInfo.appIsSystemApp || appInfo.appIsOnVendorImage || appInfo.appIsOnProductImage;
const bool includeUpdatableDrivers = !currentProcessIsOnThePlatform;
auto devicesAndUpdatability =
hardware::neuralnetworks::service::getDevices(includeUpdatableDrivers);
std::vector<std::shared_ptr<DriverDevice>> driverDevices;
driverDevices.reserve(devicesAndUpdatability.size());
for (auto& [device, isDeviceUpdatable] : devicesAndUpdatability) {
driverDevices.push_back(DriverDevice::create(std::move(device), isDeviceUpdatable));
}
return driverDevices;
}
#else
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
auto devices = getDevices();
std::vector<std::shared_ptr<DriverDevice>> driverDevices;
driverDevices.reserve(devices.size());
for (auto& device : devices) {
driverDevices.push_back(DriverDevice::create(std::move(device)));
}
return driverDevices;
}
#endif // NN_COMPATIBILITY_LIBRARY_BUILD
void DeviceManager::findAvailableDevices() {
VLOG(MANAGER) << "findAvailableDevices";
// register driver devices
auto driverDevices = getDriverDevices();
for (auto& driverDevice : driverDevices) {
VLOG(MANAGER) << "Found interface " << driverDevice->getName();
mDevices.push_back(std::move(driverDevice));
}
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
// register CPU fallback device
mDevices.push_back(CpuDevice::get());
mDevicesCpuOnly.push_back(CpuDevice::get());
#endif // NN_COMPATIBILITY_LIBRARY_BUILD
}
void DeviceManager::registerDevice(const SharedDevice& device) {
if (auto driverDevice = DriverDevice::create(device)) {
mDevices.push_back(std::move(driverDevice));
}
}
DeviceManager::DeviceManager() {
VLOG(MANAGER) << "DeviceManager::DeviceManager";
findAvailableDevices();
#ifdef NN_DEBUGGABLE
mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif // NN_DEBUGGABLE
}
} // namespace nn
} // namespace android