/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Manager"

#include "Manager.h"

#include <CpuExecutor.h>
#include <LegacyUtils.h>
#include <MetaModel.h>
#include <Tracing.h>
#include <android-base/logging.h>
#include <nnapi/IBurst.h>
#include <nnapi/IDevice.h>
#include <nnapi/IExecution.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/SharedMemory.h>
#include <nnapi/TypeUtils.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <algorithm>
#include <functional>
#include <iterator>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>

#include "ExecutionCallback.h"
#include "FeatureLevel.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "TypeManager.h"

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
#include <build/version.h>
#include <cutils/native_handle.h>
#include <nnapi/hal/1.3/Buffer.h>
#include <nnapi/hal/Service.h>

#include "AppInfoFetcher.h"
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD

namespace android {
namespace nn {

// A Device with an actual underlying driver.
class DriverDevice : public Device {
   public:
    // Creates a DriverDevice from a SharedDevice.
    // Returns nullptr on failure.
    static std::shared_ptr<DriverDevice> create(SharedDevice device, bool isUpdatable = false);

    // Prefer using DriverDevice::create.
    explicit DriverDevice(SharedDevice device, bool isUpdatable);

    const std::string& getName() const override { return kInterface->getName(); }
    const std::string& getVersionString() const override { return kInterface->getVersionString(); }
    int64_t getFeatureLevel() const override;
    int32_t getType() const override { return static_cast<int32_t>(kInterface->getType()); }
    bool isUpdatable() const override { return kIsUpdatable; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kInterface->getSupportedExtensions();
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kInterface->getCapabilities(); }
    Capabilities::PerformanceInfo getPerformance(OperandType type) const override {
        return getCapabilities().operandPerformance.lookup(type);
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceScalar;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return getCapabilities().relaxedFloat32toFloat16PerformanceTensor;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override {
        return getCapabilities().ifPerformance;
    }
    Capabilities::PerformanceInfo getWhilePerformance() const override {
        return getCapabilities().whilePerformance;
    }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return kInterface->getNumberOfCacheFilesNeeded();
    }
    bool isCachingSupported() const override {
        // Caching is supported if either of numModelCache or numDataCache is greater than 0.
        const auto [numModelCacheFiles, numDataCacheFiles] = getNumberOfCacheFilesNeeded();
        return numModelCacheFiles > 0 || numDataCacheFiles > 0;
    }
    int wait() const override {
        auto result = kInterface->wait();
        if (!result.ok()) {
            LOG(ERROR) << "DriverDevice::wait error: " << result.error().message;
            return convertErrorStatusToResultCode(result.error().code);
        }
        return ANEURALNETWORKS_NO_ERROR;
    }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType) const override;

   private:
    const SharedDevice kInterface;
    const bool kIsUpdatable;

    GeneralResult<std::vector<bool>> getSupportedOperationsImpl(const MetaModel& metaModel) const;
    GeneralResult<SharedPreparedModel> prepareModelFromCacheInternal(
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const CacheToken& token) const;

#ifdef NN_DEBUGGABLE
    // For debugging: behavior of IDevice::getSupportedOperations for SampleDriver.
    // 0 - all operations reported by IDevice::getSupportedOperations() supported
    // 1 - some operations reported by IDevice::getSupportedOperations() supported
    uint32_t mSupported = 0;
#endif  // NN_DEBUGGABLE
};

// A RuntimePreparedModel wrapping the IPreparedModel instance returned by the actual driver.
class DriverPreparedModel : public RuntimePreparedModel {
   public:
    DriverPreparedModel(const Device* device, const SharedPreparedModel& preparedModel)
        : mDevice(device), mPreparedModel(preparedModel) {
        CHECK(mDevice != nullptr);
        CHECK(mPreparedModel != nullptr);
    }

    const Device* getDevice() const override { return mDevice; }
    SharedPreparedModel getInterface() const override { return mPreparedModel; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override {
        return mPreparedModel->configureExecutionBurst();
    }

    MemoryPreference getMemoryPreference() const override {
        if (mDevice->getFeatureLevel() >= ANEURALNETWORKS_FEATURE_LEVEL_5) {
            return {kDefaultRequestMemoryAlignment, kDefaultRequestMemoryPadding};
        } else {
            // We are not able to pass memory padding information to HIDL drivers, so return the
            // minimum padding.
            return {kDefaultRequestMemoryAlignment, kMinMemoryPadding};
        }
    }

   private:
    const Device* mDevice;
    const SharedPreparedModel mPreparedModel;
};

class DriverExecution : public RuntimeExecution {
   public:
    DriverExecution(SharedExecution execution, Request request,
                    std::vector<const RuntimeMemory*> memories, MeasureTiming measure,
                    OptionalDuration loopTimeoutDuration, int64_t deviceFeatureLevel)
        : kExecution(std::move(execution)),
          kRequest(std::move(request)),
          kMemories(std::move(memories)),
          kMeasure(measure),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)),
          kDeviceFeatureLevel(deviceFeatureLevel) {
        CHECK(kExecution != nullptr);
    }

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const SharedExecution kExecution;

    // For burst execution.
    const Request kRequest;
    const std::vector<const RuntimeMemory*> kMemories;
    const MeasureTiming kMeasure;
    const OptionalDuration kLoopTimeoutDuration;
    mutable std::map<const IBurst*, SharedExecution> mCachedBurstExecutions;

    // For fenced execution.
    const int64_t kDeviceFeatureLevel;
};

DriverDevice::DriverDevice(SharedDevice device, bool isUpdatable)
    : kInterface(std::move(device)), kIsUpdatable(isUpdatable) {
    CHECK(kInterface != nullptr);
#ifdef NN_DEBUGGABLE
    static const char samplePrefix[] = "sample";
    if (getName().substr(0, sizeof(samplePrefix) - 1) == samplePrefix) {
        mSupported = getProp("debug.nn.sample.supported");
    }
#endif  // NN_DEBUGGABLE
}

std::shared_ptr<DriverDevice> DriverDevice::create(SharedDevice device, bool isUpdatable) {
    if (device == nullptr) {
        LOG(ERROR) << "DriverDevice::create called with nullptr";
        return nullptr;
    }
    return std::make_shared<DriverDevice>(std::move(device), isUpdatable);
}

int64_t DriverDevice::getFeatureLevel() const {
    Version featureLevel = kInterface->getFeatureLevel();
    switch (featureLevel) {
        case Version::ANDROID_OC_MR1:
            return ANEURALNETWORKS_FEATURE_LEVEL_1;
        case Version::ANDROID_P:
            return ANEURALNETWORKS_FEATURE_LEVEL_2;
        case Version::ANDROID_Q:
            return ANEURALNETWORKS_FEATURE_LEVEL_3;
        case Version::ANDROID_R:
            return ANEURALNETWORKS_FEATURE_LEVEL_4;
        case Version::ANDROID_S:
            return ANEURALNETWORKS_FEATURE_LEVEL_5;
        case Version::CURRENT_RUNTIME:
            break;
    }
    LOG(FATAL) << "Unsupported driver feature level: " << featureLevel;
    return -1;
}

GeneralResult<std::vector<bool>> DriverDevice::getSupportedOperationsImpl(
        const MetaModel& metaModel) const {
    const auto featureLevel = kInterface->getFeatureLevel();
    const auto slice = metaModel.getSlice(featureLevel);
    if (!slice.has_value()) {
        return NN_ERROR() << "getSlice(" << featureLevel << ") failed";
    }

    const auto& [sliceModel, slicedModelOperationIndexToModelOperationIndex] = *slice;
    const std::vector<bool> supported = NN_TRY(kInterface->getSupportedOperations(sliceModel));
    const uint32_t slicedOperationCount = sliceModel.main.operations.size();
    if (supported.size() != slicedOperationCount) {
        return NN_ERROR() << "IDevice::getSupportedOperations returned a vector of length "
                          << supported.size() << " when expecting " << slicedOperationCount;
    }

    const Model& model = metaModel.getModel();
    const uint32_t operationCount = model.main.operations.size();
    std::vector<bool> remappedSupported(operationCount, false);
    for (size_t i = 0; i < supported.size(); ++i) {
        if (supported[i]) {
            remappedSupported[slicedModelOperationIndexToModelOperationIndex(i)] = true;
        }
    }
    return remappedSupported;
}

std::vector<bool> DriverDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model =
            metaModel.getModel();

    auto result = getSupportedOperationsImpl(metaModel);
    if (!result.ok()) {
        LOG(ERROR) << "getSupportedOperations failed with code " << result.error().code << ": "
                   << result.error().message;
        // Set the supported operation vector to all false, so we won't use this driver.
        return std::vector<bool>(model.main.operations.size(), false);
    }

    std::vector<bool>& supportedOperations = result.value();
#ifdef NN_DEBUGGABLE
    if (mSupported != 1) {
        return supportedOperations;
    }

    const uint32_t baseAccumulator = std::hash<std::string>{}(getName());
    for (size_t operationIndex = 0; operationIndex < supportedOperations.size();
         operationIndex++) {
        if (!supportedOperations[operationIndex]) {
            continue;
        }

        uint32_t accumulator = baseAccumulator;
        const Operation& operation = model.main.operations[operationIndex];
        accumulator ^= static_cast<uint32_t>(operation.type);
        auto accumulateOperands = [&model, &accumulator](const std::vector<uint32_t>& operands) {
            for (uint32_t operandIndex : operands) {
                const Operand& operand = model.main.operands[operandIndex];
                accumulator ^= static_cast<uint32_t>(operand.type);
                accumulator ^= operand.dimensions.size();
                for (const Dimension& dimension : operand.dimensions) {
                    accumulator ^= dimension;
                    if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY ||
                        operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE ||
                        operand.lifetime == Operand::LifeTime::POINTER) {
                        accumulator ^= 1;
                    }
                }
            }
        };
        accumulateOperands(operation.inputs);
        accumulateOperands(operation.outputs);
        if (accumulator & 1) {
            supportedOperations[operationIndex] = false;
        }
    }
#endif  // NN_DEBUGGABLE
    return supportedOperations;
}

// Opens a cache file for reading and writing and returns a shared handle.
static GeneralResult<SharedHandle> createCacheHandle(const std::string& filename,
                                                     bool createIfNotExist) {
    auto fd = base::unique_fd(open(filename.c_str(),
                                   createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR,
                                   S_IRUSR | S_IWUSR));
    if (fd.get() == -1) {
        return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
               << "Failed to " << (createIfNotExist ? "open or create" : "open") << " cache file "
               << filename;
    }
    std::vector<base::unique_fd> fds;
    fds.push_back(std::move(fd));
    return std::make_shared<const Handle>(Handle{
            .fds = std::move(fds),
            .ints = {},
    });
}

// Opens a list of cache files and returns a vector of shared handles. The files
// are always opened with both read and write permissions.
static GeneralResult<std::vector<SharedHandle>> createCacheHandleVec(
        uint32_t numCacheFiles, const std::string& baseFilename, bool createIfNotExist) {
    CHECK(numCacheFiles <= kMaxNumberOfCacheFiles);
    std::vector<SharedHandle> handles;
    handles.reserve(numCacheFiles);
    for (uint32_t i = 0; i < numCacheFiles; i++) {
        std::string filename = baseFilename + std::to_string(i);
        VLOG(COMPILATION) << "Cache " << i << ": " << filename;
        handles.push_back(NN_TRY(createCacheHandle(filename, createIfNotExist)));
    }
    return handles;
}

// Maps a token to cache file names and returns a pair of vectors of shared
// handles to the opened files.
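// The file name is derived from the compilation cache token: each token byte is encoded as two
// characters in the range 'A'..'P' (low nibble first, e.g. byte 0x2A becomes "KC"), followed by a
// single character that distinguishes model cache ('1') from data cache ('2'); the per-file index
// is then appended by createCacheHandleVec.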
static GeneralResult<CacheHandles> getCacheHandles(
        const CacheInfo& cacheInfo, const CacheToken& token,
        const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist) {
    if (const auto* cacheHandles = std::get_if<CacheHandles>(&cacheInfo.variant)) {
        if (cacheHandles->modelCache.size() != numCacheFiles.first) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.first << " model cache handles, got "
                   << cacheHandles->modelCache.size();
        }
        if (cacheHandles->dataCache.size() != numCacheFiles.second) {
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                   << "Expected " << numCacheFiles.second << " data cache handles, got "
                   << cacheHandles->dataCache.size();
        }
        return *cacheHandles;
    }

    // The filename includes kByteSizeOfCacheToken * 2 characters for the token,
    // and 1 character for the model/data cache identifier.
    std::string filename(kByteSizeOfCacheToken * 2 + 1, '0');
    for (uint32_t i = 0; i < kByteSizeOfCacheToken; i++) {
        filename[i * 2] = 'A' + (token[i] & 0x0F);
        filename[i * 2 + 1] = 'A' + (token[i] >> 4);
    }

    const auto& cacheDir = std::get<CacheDir>(cacheInfo.variant);
    CHECK(cacheDir.empty() || cacheDir.back() == '/');
    std::string cacheFileName = cacheDir + filename;
    const uint32_t cacheTypeIdentifierIndex = cacheDir.size() + kByteSizeOfCacheToken * 2;

    cacheFileName[cacheTypeIdentifierIndex] = '1';
    std::vector<SharedHandle> modelCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist));

    cacheFileName[cacheTypeIdentifierIndex] = '2';
    std::vector<SharedHandle> dataCache =
            NN_TRY(createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist));

    return CacheHandles{
            .modelCache = std::move(modelCache),
            .dataCache = std::move(dataCache),
    };
}

GeneralResult<SharedPreparedModel> DriverDevice::prepareModelFromCacheInternal(
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const CacheToken& token) const {
    // Get cache files if they exist, otherwise return from the function early.
    auto cache = NN_TRY(getCacheHandles(cacheInfo, token, kInterface->getNumberOfCacheFilesNeeded(),
                                        /*createIfNotExist=*/false));
    return kInterface->prepareModelFromCache(deadline, cache.modelCache, cache.dataCache, token);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> DriverDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
        const std::optional<CacheToken>& maybeToken) const {
    // Attempt to compile from cache if a token is present.
    if (maybeToken.has_value()) {
        auto result = prepareModelFromCacheInternal(deadline, cacheInfo, *maybeToken);
        if (result.has_value()) {
            return {ANEURALNETWORKS_NO_ERROR,
                    std::make_shared<DriverPreparedModel>(this, std::move(result).value())};
        } else {
            LOG(ERROR) << "prepareModelFromCache failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get cache files if they exist, otherwise create them.
    CacheHandles cache;
    if (maybeToken.has_value()) {
        auto result = getCacheHandles(cacheInfo, *maybeToken,
                                      kInterface->getNumberOfCacheFilesNeeded(),
                                      /*createIfNotExist=*/true);
        if (result.has_value()) {
            cache = std::move(result).value();
        } else {
            LOG(ERROR) << "getCacheHandles failure (" << result.error().code
                       << "): " << result.error().message;
        }
    }

    // Get the token if it exists, otherwise use a null token.
    static constexpr CacheToken kNullToken = {};
    const CacheToken token = maybeToken.value_or(kNullToken);

    // Fall back to full compilation (possibly with the token) if
    // prepareModelFromCache could not be used or failed.
    const Model model = makeModel();
    auto result = kInterface->prepareModel(model, preference, priority, deadline, cache.modelCache,
                                           cache.dataCache, token);
    if (!result.ok()) {
        LOG(ERROR) << "IDevice::prepareModel() error: " << result.error().message;
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    SharedPreparedModel preparedModel = std::move(result).value();
    CHECK(preparedModel != nullptr)
            << "IDevice::prepareModel() returned nullptr without error code";
    return {ANEURALNETWORKS_NO_ERROR,
            std::make_shared<DriverPreparedModel>(this, std::move(preparedModel))};
}

std::pair<int, std::unique_ptr<RuntimeMemory>> DriverDevice::allocate(const MemoryDescriptor& desc,
                                                                      OperandType) const {
    const BufferDesc bufferDesc = {.dimensions = desc.dimensions};
    std::vector<SharedPreparedModel> preparedModels(desc.preparedModels.size());
    std::transform(desc.preparedModels.begin(), desc.preparedModels.end(), preparedModels.begin(),
                   [](const auto* preparedModel) {
                       const auto versionedPreparedModel = preparedModel->getInterface();
                       CHECK(versionedPreparedModel != nullptr);
                       return versionedPreparedModel;
                   });
    auto result =
            kInterface->allocate(bufferDesc, preparedModels, desc.inputRoles, desc.outputRoles);
    if (!result.ok()) {
        LOG(ERROR) << "DriverDevice::allocate -- memory allocation on device " << getName()
                   << " failed!";
        return {convertErrorStatusToResultCode(result.error().code), nullptr};
    }
    return MemoryFromDevice::create(std::move(result).value());
}

static Request createDriverRequest(const std::vector<ModelArgumentInfo>& inputs,
                                   const std::vector<ModelArgumentInfo>& outputs,
                                   const std::vector<const RuntimeMemory*>& memories) {
    Request request;
    request.inputs.reserve(inputs.size());
    std::transform(inputs.begin(), inputs.end(), std::back_inserter(request.inputs),
                   [](const auto& input) { return input.createRequestArgument(); });
    request.outputs.reserve(outputs.size());
    std::transform(outputs.begin(), outputs.end(), std::back_inserter(request.outputs),
                   [](const auto& output) { return output.createRequestArgument(); });
    request.pools.reserve(memories.size());
    std::transform(memories.begin(), memories.end(), std::back_inserter(request.pools),
                   [](const RuntimeMemory* memory) { return memory->getMemoryPool(); });
    return request;
}

// Perform computation on an actual device driver.
//
// Because HIDL cannot take raw pointers, two separate memory pools will be allocated for inputs
// and outputs specified by pointers. The input pointer data will be copied to the input pool
// prior to execution, and the output pointer data will be copied out from the output pool after
// the execution.
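//
// When a burst controller is provided, the request is dispatched through IBurst and any
// device-backed memories are cached with the burst object so they do not have to be re-imported
// on every execution; otherwise the request goes directly through IPreparedModel::execute().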
std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::execute::execute");

    ExecutionResult<std::pair<std::vector<OutputShape>, Timing>> result;

    // Compute using burst if present, otherwise compute from IPreparedModel.
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
        for (const RuntimeMemory* memory : memories) {
            const auto pool = memory->getMemoryPool();
            if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                auto cacheHold = burstController->cacheMemory(*maybeMemory);
                memory->hold(cacheHold);
            }
        }

        VLOG(EXECUTION) << "Before burstController->execute() " << SHOW_IF_DEBUG(request);

        result = burstController->execute(request, measure, deadline, loopTimeoutDuration);
    } else {
        result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
    }

    int n = ANEURALNETWORKS_OP_FAILED;
    std::vector<OutputShape> outputShapes;
    Timing timing;

    if (result.ok()) {
        n = ANEURALNETWORKS_NO_ERROR;
        std::tie(outputShapes, timing) = std::move(result).value();
    } else {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            outputShapes = std::move(returnedOutputShapes);
        }
        return {n, std::move(outputShapes), timing};
    }

    VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration,
        const OptionalDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::executeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    auto request = createDriverRequest(inputs, outputs, memories);

    NNTRACE_RT_SWITCH(NNTRACE_PHASE_EXECUTION, "DriverPreparedModel::executeFenced");

    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
    if (mDevice->getFeatureLevel() >= kHalVersionV1_3ToApi.featureLevel) {
        auto result = mPreparedModel->executeFenced(request, waitForHandles, measure, deadline,
                                                    loopTimeoutDuration,
                                                    timeoutDurationAfterFence);
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::executeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**executeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fall back
        // to synchronous execution if executeFenced is not supported.
        // First wait for all sync fences to be ready.
        LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution";
        for (const auto& fence : waitForHandles) {
            if (!fence.hasFd() || fence.getFd() < 0) {
                return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}};
            }
            auto r = fence.syncWait({/* no timeout */});
            if (r != SyncFence::FenceState::SIGNALED) {
                LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
        auto result = mPreparedModel->execute(request, measure, deadline, loopTimeoutDuration);
        if (!result.ok()) {
            LOG(ERROR) << "IPreparedModel::execute() error: " << result.error().message;
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(std::ignore, timing) = result.value();
    }

    int syncFenceFd = -1;
    if (syncFence.hasFd()) {
        syncFenceFd = dup(syncFence.getFd());
        if (syncFenceFd < 0) {
            LOG(ERROR) << "Failed to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing};
        }
    }

    VLOG(EXECUTION) << "DriverPreparedModel::executeFenced completed";
    return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing};
}

std::pair<int, std::shared_ptr<RuntimeExecution>> DriverPreparedModel::createReusableExecution(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
        const OptionalDuration& loopTimeoutDuration) const {
    NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::createReusableExecution");

    auto request = createDriverRequest(inputs, outputs, memories);
    auto result = mPreparedModel->createReusableExecution(request, measure, loopTimeoutDuration);
    if (!result.ok()) {
        LOG(ERROR) << "IPreparedModel::createReusableExecution() error: "
                   << result.error().message;
        const int n = convertErrorStatusToResultCode(result.error().code);
        return {n, nullptr};
    }
    auto execution = std::make_shared<DriverExecution>(std::move(result).value(),
                                                       std::move(request), memories, measure,
                                                       loopTimeoutDuration,
                                                       mDevice->getFeatureLevel());
    return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}

std::tuple<int, std::vector<OutputShape>, Timing> DriverExecution::compute(
        const SharedBurst& burstController, const OptionalTimePoint& deadline) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::compute");

    // Compute using burst if present, otherwise compute from IPreparedModel.
    SharedExecution execution;
    const bool burstCompute = (burstController != nullptr);
    if (burstCompute) {
        // Create a reusable burst execution if the controller has not been seen before.
        auto burstExecution = mCachedBurstExecutions.find(burstController.get());
        if (burstExecution == mCachedBurstExecutions.end()) {
            for (const RuntimeMemory* memory : kMemories) {
                const auto pool = memory->getMemoryPool();
                if (const auto* maybeMemory = std::get_if<SharedMemory>(&pool)) {
                    auto cacheHold = burstController->cacheMemory(*maybeMemory);
                    memory->hold(cacheHold);
                }
            }
            auto createResult = burstController->createReusableExecution(kRequest, kMeasure,
                                                                         kLoopTimeoutDuration);
            if (!createResult.ok()) {
                LOG(ERROR) << "IBurst::createReusableExecution() error: "
                           << createResult.error().message;
                const int n = convertErrorStatusToResultCode(createResult.error().code);
                return {n, {}, {}};
            }
            execution = std::move(createResult).value();
            mCachedBurstExecutions.emplace(burstController.get(), execution);
        } else {
            execution = burstExecution->second;
        }
        VLOG(EXECUTION) << "Before mBurstExecution->compute() " << SHOW_IF_DEBUG(kRequest);
    } else {
        execution = kExecution;
    }

    CHECK(execution != nullptr);
    auto
            result = execution->compute(deadline);
    if (!result.ok()) {
        auto [message, code, returnedOutputShapes] = std::move(result).error();
        int n = convertErrorStatusToResultCode(code);
        VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
        LOG(ERROR) << (burstCompute ? "IBurst" : "IPreparedModel")
                   << "::execute(...) error: " << message;
        if (code == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
            return {n, std::move(returnedOutputShapes), {}};
        }
        return {n, {}, {}};
    }

    VLOG(EXECUTION) << "DriverExecution::compute completed";
    auto [outputShapes, timing] = std::move(result).value();
    return {ANEURALNETWORKS_NO_ERROR, std::move(outputShapes), timing};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> DriverExecution::computeFenced(
        const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
        const OptionalDuration& timeoutDurationAfterFence) const {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "DriverExecution::computeFenced");
    CHECK(std::all_of(waitFor.begin(), waitFor.end(), [](int fd) { return fd >= 0; }));

    std::vector<SyncFence> waitForHandles;
    waitForHandles.reserve(waitFor.size());
    for (int fd : waitFor) {
        int dupFd = dup(fd);
        if (dupFd < 0) {
            LOG(ERROR) << "Unable to dup the file descriptor";
            return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
        }
        waitForHandles.push_back(SyncFence::create(base::unique_fd(dupFd)));
    }

    SyncFence syncFence = SyncFence::createAsSignaled();
    ExecuteFencedInfoCallback executeFencedInfoCallback = nullptr;
    Timing timing = {};
    if (kDeviceFeatureLevel >= kHalVersionV1_3ToApi.featureLevel) {
        auto result = kExecution->computeFenced(waitForHandles, deadline,
                                                timeoutDurationAfterFence);
        if (!result.ok()) {
            LOG(ERROR) << "IExecution::computeFenced() error: " << result.error().message;
            VLOG(EXECUTION) << "**computeFenced failed**";
            return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}};
        }
        std::tie(syncFence, executeFencedInfoCallback) = std::move(result).value();
    } else {
        // Fall back to synchronous execution if computeFenced is not supported.
        // First wait for all sync fences to be ready.
LOG(INFO) << "No drivers able to handle sync fences, falling back to regular execution"; for (const auto& fence : waitForHandles) { if (!fence.hasFd() || fence.getFd() < 0) { return {ANEURALNETWORKS_BAD_DATA, -1, nullptr, {}}; } auto r = fence.syncWait({/* no timeout */}); if (r != SyncFence::FenceState::SIGNALED) { LOG(ERROR) << "syncWait failed, fd: " << fence.getFd() << ", state: " << r; return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}}; } } auto result = kExecution->compute(deadline); if (!result.ok()) { LOG(ERROR) << "IExecution::compute() error: " << result.error().message; return {convertErrorStatusToResultCode(result.error().code), -1, nullptr, {}}; } std::tie(std::ignore, timing) = result.value(); } int syncFenceFd = -1; if (syncFence.hasFd()) { syncFenceFd = dup(syncFence.getFd()); if (syncFenceFd < 0) { LOG(ERROR) << "Failed to dup the file descriptor"; return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, timing}; } } VLOG(EXECUTION) << "DriverExecution::computeFenced completed"; return {ANEURALNETWORKS_NO_ERROR, syncFenceFd, executeFencedInfoCallback, timing}; } static Capabilities createCpuCapabilities() { constexpr Capabilities::PerformanceInfo kPerf = {.execTime = 1.0f, .powerUsage = 1.0f}; constexpr OperandType operandTypes[] = { OperandType::FLOAT32, OperandType::INT32, OperandType::UINT32, OperandType::TENSOR_FLOAT32, OperandType::TENSOR_INT32, OperandType::TENSOR_QUANT8_ASYMM, OperandType::BOOL, OperandType::TENSOR_QUANT16_SYMM, OperandType::TENSOR_FLOAT16, OperandType::TENSOR_BOOL8, OperandType::FLOAT16, OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL, OperandType::TENSOR_QUANT16_ASYMM, OperandType::TENSOR_QUANT8_SYMM, OperandType::TENSOR_QUANT8_ASYMM_SIGNED, }; std::vector operandPerformance; operandPerformance.reserve(std::size(operandTypes)); std::transform(std::begin(operandTypes), std::end(operandTypes), std::back_inserter(operandPerformance), [kPerf](OperandType type) { return Capabilities::OperandPerformance{.type = type, .info = kPerf}; }); auto table = Capabilities::OperandPerformanceTable::create(std::move(operandPerformance)).value(); return Capabilities{ .relaxedFloat32toFloat16PerformanceScalar = kPerf, .relaxedFloat32toFloat16PerformanceTensor = kPerf, .operandPerformance = std::move(table), .ifPerformance = kPerf, .whilePerformance = kPerf, }; } // A special abstracted device for the CPU. Only one instance of this class will exist. // Use get() to retrieve it. class CpuDevice : public Device { public: // Returns the singleton CPU fallback device. 
    static std::shared_ptr<CpuDevice> get() {
        static std::shared_ptr<CpuDevice> instance(new CpuDevice);
        return instance;
    }

    const std::string& getName() const override { return kName; }
    const std::string& getVersionString() const override { return kVersionString; }
    int64_t getFeatureLevel() const override { return kFeatureLevel; }
    int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
    bool isUpdatable() const override { return false; }
    const std::vector<Extension>& getSupportedExtensions() const override {
        return kSupportedExtensions;
    }
    std::vector<bool> getSupportedOperations(const MetaModel& metaModel) const override;
    const Capabilities& getCapabilities() const override { return kCapabilities; }
    Capabilities::PerformanceInfo getPerformance(OperandType) const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getRelaxedFloat32toFloat16PerformanceTensor() const override {
        return kPerformance;
    }
    Capabilities::PerformanceInfo getIfPerformance() const override { return kPerformance; }
    Capabilities::PerformanceInfo getWhilePerformance() const override { return kPerformance; }
    std::pair<uint32_t, uint32_t> getNumberOfCacheFilesNeeded() const override {
        return {/*numModelCache=*/0, /*numDataCache=*/0};
    }
    bool isCachingSupported() const override { return false; }
    int wait() const override { return ANEURALNETWORKS_NO_ERROR; }

    std::pair<int, std::shared_ptr<RuntimePreparedModel>> prepareModel(
            const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
            const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            const std::optional<CacheToken>& maybeToken) const override;

    std::pair<int, std::unique_ptr<RuntimeMemory>> allocate(const MemoryDescriptor& desc,
                                                            OperandType type) const override;

   private:
    CpuDevice() = default;

    const int64_t kFeatureLevel = kCurrentNNAPIRuntimeFeatureLevel;
    const std::string kName = "nnapi-reference";
#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
    const std::string kVersionString = build::GetBuildNumber();
#else
    const std::string kVersionString = "UNKNOWN";
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD
    // Since performance is expressed as a ratio relative to CPU performance,
    // by definition the performance of the CPU is 1.0.
    const Capabilities::PerformanceInfo kPerformance = {.execTime = 1.0f, .powerUsage = 1.0f};
    const Capabilities kCapabilities = createCpuCapabilities();
    const std::vector<Extension> kSupportedExtensions{/* No extensions. */};
};

// A special abstracted RuntimePreparedModel for the CPU, constructed by CpuDevice.
class CpuPreparedModel : public RuntimePreparedModel {
   public:
    // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR and
    // a prepared model object if successfully created. Returns an error code
    // and nullptr otherwise.
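    // The model pools are mapped once at creation time and reused for every execution on this
    // prepared model.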
    static std::pair<int, std::shared_ptr<RuntimePreparedModel>> create(Model model);

    const Device* getDevice() const override { return CpuDevice::get().get(); }
    SharedPreparedModel getInterface() const override { return nullptr; }

    std::tuple<int, std::vector<OutputShape>, Timing> execute(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const SharedBurst& burstController,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration) const override;

    GeneralResult<SharedBurst> configureExecutionBurst() const override { return nullptr; }

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> executeFenced(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
            MeasureTiming measure, const OptionalTimePoint& deadline,
            const OptionalDuration& loopTimeoutDuration,
            const OptionalDuration& timeoutDurationAfterFence) const override;

    std::pair<int, std::shared_ptr<RuntimeExecution>> createReusableExecution(
            const std::vector<ModelArgumentInfo>& inputs,
            const std::vector<ModelArgumentInfo>& outputs,
            const std::vector<const RuntimeMemory*>& memories, MeasureTiming measure,
            const OptionalDuration& loopTimeoutDuration) const override;

    MemoryPreference getMemoryPreference() const override {
        return {kPreferredAlignment, kPreferredPadding};
    }

    // Prefer to use CpuPreparedModel::create.
    CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
        : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}

    const Model& getModel() const { return mModel; }
    const std::vector<RunTimePoolInfo>& getModelPoolInfos() const { return mModelPoolInfos; }

   private:
    // TFLite kernels prefer 64 bytes for padding and alignment.
    static constexpr uint32_t kPreferredAlignment = 64;
    static constexpr uint32_t kPreferredPadding = 64;

    const Model mModel;
    const std::vector<RunTimePoolInfo> mModelPoolInfos;
};

class CpuExecution : public RuntimeExecution {
   public:
    CpuExecution(const CpuPreparedModel& preparedModel, Request request,
                 std::vector<RunTimePoolInfo> requestPoolInfos,
                 OptionalDuration loopTimeoutDuration)
        : kPreparedModel(preparedModel),
          kRequest(std::move(request)),
          kRequestPoolInfos(std::move(requestPoolInfos)),
          kLoopTimeoutDuration(std::move(loopTimeoutDuration)) {}

    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const SharedBurst& burstController, const OptionalTimePoint& deadline) const override;

    std::tuple<int, int, ExecuteFencedInfoCallback, Timing> computeFenced(
            const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
            const OptionalDuration& timeoutDurationAfterFence) const override;

   private:
    const CpuPreparedModel& kPreparedModel;
    Request kRequest;
    std::vector<RunTimePoolInfo> kRequestPoolInfos;
    const OptionalDuration kLoopTimeoutDuration;
};

std::vector<bool> CpuDevice::getSupportedOperations(const MetaModel& metaModel) const {
    const Model& model = metaModel.getModel();
    const size_t count = model.main.operations.size();
    std::vector<bool> result(count, false);
    for (size_t i = 0; i < count; i++) {
        // TODO(b/119870033): Decide whether and how post-P operations would be supported on CPU.
        // We may want to use the slicer for CpuDevice just as we do for DriverDevice.
        OperationType operationType = model.main.operations[i].type;
        result[i] = !isExtension(operationType) && operationType != OperationType::OEM_OPERATION;
    }
    return result;
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuDevice::prepareModel(
        const ModelFactory& makeModel, ExecutionPreference preference, Priority priority,
        const OptionalTimePoint& deadline, const CacheInfo& /*cacheInfo*/,
        const std::optional<CacheToken>& maybeToken) const {
    CHECK(!maybeToken.has_value())
            << "Should never call prepareModel with cache information on CpuDevice";

    const Model model = makeModel();
    if (auto result = validate(model); !result.ok()) {
        LOG(ERROR) << "Invalid Model: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (auto result = validate(preference); !result.ok()) {
        LOG(ERROR) << "Invalid ExecutionPreference: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (auto result = validate(priority); !result.ok()) {
        LOG(ERROR) << "Invalid Priority: " << result.error();
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, nullptr};
    }

    return CpuPreparedModel::create(model);
}

std::pair<int, std::unique_ptr<RuntimeMemory>> CpuDevice::allocate(const MemoryDescriptor& desc,
                                                                   OperandType type) const {
    uint32_t size = TypeManager::get()->getSizeOfData(type, desc.dimensions);
    if (size == 0) {
        LOG(ERROR) << "CpuDevice::allocate -- does not support unknown dimensions.";
        return {ANEURALNETWORKS_OP_FAILED, nullptr};
    }
    return MemoryAshmem::create(size);
}

std::pair<int, std::shared_ptr<RuntimePreparedModel>> CpuPreparedModel::create(Model model) {
    std::vector<RunTimePoolInfo> poolInfos;
    if (!setRunTimePoolInfosFromCanonicalMemories(&poolInfos, model.pools)) {
        return {ANEURALNETWORKS_UNMAPPABLE, nullptr};
    }

    std::shared_ptr<CpuPreparedModel> preparedModel =
            std::make_shared<CpuPreparedModel>(std::move(model), std::move(poolInfos));
    return {ANEURALNETWORKS_NO_ERROR, std::move(preparedModel)};
}

static std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpu(
        const Model& model, const Request& request,
        const std::vector<RunTimePoolInfo>& modelPoolInfos,
        const std::vector<RunTimePoolInfo>& requestPoolInfos, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) {
    NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
    CpuExecutor executor;
    if (loopTimeoutDuration.has_value()) {
        executor.setLoopTimeout(loopTimeoutDuration->count());
    }
    if (deadline.has_value()) {
        executor.setDeadline(*deadline);
    }
    int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
    const auto& outputShapes = executor.getOutputShapes();
    return {err, outputShapes, {}};
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuPreparedModel::executeFenced(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const std::vector<int>& waitFor,
        MeasureTiming measure, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration, const OptionalDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuPreparedModel::executeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
    }

    // Update the deadline if the timeout duration is closer than the deadline.
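    // That is, closestDeadline = min(deadline, now + duration).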
    auto closestDeadline = deadline;
    if (duration.has_value()) {
        const auto timeoutDurationDeadline = makeDeadline(*duration);
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = execute(inputs, outputs, memories, nullptr, measure,
                                                        closestDeadline, loopTimeoutDuration);
    return {result, -1, nullptr, timing};
}

static std::tuple<int, Request, std::vector<RunTimePoolInfo>> createCpuRequest(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories) {
    std::vector<RunTimePoolInfo> requestPoolInfos;
    requestPoolInfos.reserve(memories.size());
    for (const RuntimeMemory* mem : memories) {
        if (std::optional<RunTimePoolInfo> poolInfo = mem->getRunTimePoolInfo()) {
            requestPoolInfos.emplace_back(*poolInfo);
        } else {
            return {ANEURALNETWORKS_UNMAPPABLE, {}, {}};
        }
    }

    // Create as many pools as there are inputs / outputs passed by pointer.
    auto fixPointerArguments =
            [&requestPoolInfos](const std::vector<ModelArgumentInfo>& argumentInfos) {
                std::vector<DataLocation> ptrArgsLocations;
                for (const ModelArgumentInfo& argumentInfo : argumentInfos) {
                    if (argumentInfo.state() == ModelArgumentInfo::POINTER) {
                        ptrArgsLocations.push_back(
                                {.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()),
                                 .offset = 0,
                                 .length = argumentInfo.length(),
                                 .padding = argumentInfo.padding()});
                        requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
                                static_cast<uint8_t*>(argumentInfo.buffer())));
                    }
                }
                return ptrArgsLocations;
            };
    const std::vector<DataLocation> inputPtrArgsLocations = fixPointerArguments(inputs);
    const std::vector<DataLocation> outputPtrArgsLocations = fixPointerArguments(outputs);

    Request request;
    request.inputs = createRequestArguments(inputs, inputPtrArgsLocations);
    request.outputs = createRequestArguments(outputs, outputPtrArgsLocations);
    return {ANEURALNETWORKS_NO_ERROR, std::move(request), std::move(requestPoolInfos)};
}

// Perform computation on the NNAPI CPU reference implementation.
//
// Unlike DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the same process
// as the NNAPI runtime and can take raw pointers. We will create as many pools as there are
// inputs/outputs in this method to avoid data copying.
//
// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
std::tuple<int, std::vector<OutputShape>, Timing> CpuPreparedModel::execute(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, const SharedBurst& /*burstController*/,
        MeasureTiming /*measure*/, const OptionalTimePoint& deadline,
        const OptionalDuration& loopTimeoutDuration) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
    }

    int nCreateRequest;
    Request request;
    std::vector<RunTimePoolInfo> requestPoolInfos;
    std::tie(nCreateRequest, request, requestPoolInfos) =
            createCpuRequest(inputs, outputs, memories);
    if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
        return {nCreateRequest, {}, {}};
    }

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
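        // Note: the worker thread is joined before this function returns, so capturing the
        // locals by reference below is safe.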
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &request, &requestPoolInfos, &deadline, &loopTimeoutDuration, &result] {
            result = computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                                  loopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, deadline,
                        loopTimeoutDuration);
}

std::pair<int, std::shared_ptr<RuntimeExecution>> CpuPreparedModel::createReusableExecution(
        const std::vector<ModelArgumentInfo>& inputs, const std::vector<ModelArgumentInfo>& outputs,
        const std::vector<const RuntimeMemory*>& memories, MeasureTiming /*measure*/,
        const OptionalDuration& loopTimeoutDuration) const {
    auto [nCreateRequest, request, requestPoolInfos] = createCpuRequest(inputs, outputs, memories);
    if (nCreateRequest != ANEURALNETWORKS_NO_ERROR) {
        return {nCreateRequest, nullptr};
    }
    auto execution = std::make_shared<CpuExecution>(*this, std::move(request),
                                                    std::move(requestPoolInfos),
                                                    loopTimeoutDuration);
    return {ANEURALNETWORKS_NO_ERROR, std::move(execution)};
}

std::tuple<int, std::vector<OutputShape>, Timing> CpuExecution::compute(
        const SharedBurst& /*burstController*/, const OptionalTimePoint& deadline) const {
    if (hasDeadlinePassed(deadline)) {
        return {ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT, {}, {}};
    }

    if (!DeviceManager::get()->syncExecCpu()) {
        // TODO: use a thread pool
        // TODO(mikie): this could have NNTRACE so we could measure the overhead
        //              of spinning up a new thread.
        std::tuple<int, std::vector<OutputShape>, Timing> result = {};
        std::thread([this, &deadline, &result] {
            result = computeOnCpu(kPreparedModel.getModel(), kRequest,
                                  kPreparedModel.getModelPoolInfos(), kRequestPoolInfos, deadline,
                                  kLoopTimeoutDuration);
        }).join();
        return result;
    }

    return computeOnCpu(kPreparedModel.getModel(), kRequest, kPreparedModel.getModelPoolInfos(),
                        kRequestPoolInfos, deadline, kLoopTimeoutDuration);
}

std::tuple<int, int, ExecuteFencedInfoCallback, Timing> CpuExecution::computeFenced(
        const std::vector<int>& waitFor, const OptionalTimePoint& deadline,
        const OptionalDuration& duration) const {
    VLOG(EXECUTION)
            << "CpuExecution::computeFenced wait for sync fences to signal before execution";
    for (int syncFd : waitFor) {
        if (syncFd > 0) {
            auto r = syncWait(syncFd, -1);
            if (r != FenceState::SIGNALED) {
                LOG(ERROR) << "sync wait failed, fd: " << syncFd;
                return {ANEURALNETWORKS_OP_FAILED, -1, nullptr, {}};
            }
        }
    }

    // Update the deadline if the timeout duration is closer than the deadline.
    auto closestDeadline = deadline;
    if (duration.has_value()) {
        const auto timeoutDurationDeadline = makeDeadline(*duration);
        if (!closestDeadline.has_value() || *closestDeadline > timeoutDurationDeadline) {
            closestDeadline = timeoutDurationDeadline;
        }
    }

    const auto [result, outputShapes, timing] = compute(nullptr, closestDeadline);
    return {result, -1, nullptr, timing};
}

DeviceManager* DeviceManager::get() {
    static DeviceManager manager;
    return &manager;
}

std::shared_ptr<Device> DeviceManager::getCpuDevice() {
    return CpuDevice::get();
}

std::shared_ptr<Device> DeviceManager::forTest_makeDriverDevice(const SharedDevice& device) {
    VLOG(MANAGER) << "forTest_makeDriverDevice(" << device->getName() << ")";
    const auto driverDevice = DriverDevice::create(device);
    CHECK(driverDevice != nullptr);
    return driverDevice;
}

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
    // Only include updatable drivers if the current process is not on the platform.
    const auto& appInfo = AppInfoFetcher::get()->getAppInfo();
    const bool currentProcessIsOnThePlatform =
            appInfo.appIsSystemApp || appInfo.appIsOnVendorImage || appInfo.appIsOnProductImage;
    const bool includeUpdatableDrivers = !currentProcessIsOnThePlatform;

    auto devicesAndUpdatability =
            hardware::neuralnetworks::service::getDevices(includeUpdatableDrivers);

    std::vector<std::shared_ptr<DriverDevice>> driverDevices;
    driverDevices.reserve(devicesAndUpdatability.size());
    for (auto& [device, isDeviceUpdatable] : devicesAndUpdatability) {
        driverDevices.push_back(DriverDevice::create(std::move(device), isDeviceUpdatable));
    }
    return driverDevices;
}
#else
std::vector<std::shared_ptr<DriverDevice>> getDriverDevices() {
    auto devices = getDevices();
    std::vector<std::shared_ptr<DriverDevice>> driverDevices;
    driverDevices.reserve(devices.size());
    for (auto& device : devices) {
        driverDevices.push_back(DriverDevice::create(std::move(device)));
    }
    return driverDevices;
}
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD

void DeviceManager::findAvailableDevices() {
    VLOG(MANAGER) << "findAvailableDevices";

    // Register driver devices.
    auto driverDevices = getDriverDevices();
    for (auto& driverDevice : driverDevices) {
        VLOG(MANAGER) << "Found interface " << driverDevice->getName();
        mDevices.push_back(std::move(driverDevice));
    }

#ifndef NN_COMPATIBILITY_LIBRARY_BUILD
    // Register the CPU fallback device.
    mDevices.push_back(CpuDevice::get());
    mDevicesCpuOnly.push_back(CpuDevice::get());
#endif  // NN_COMPATIBILITY_LIBRARY_BUILD
}

void DeviceManager::registerDevice(const SharedDevice& device) {
    if (auto driverDevice = DriverDevice::create(device)) {
        mDevices.push_back(std::move(driverDevice));
    }
}

DeviceManager::DeviceManager() {
    VLOG(MANAGER) << "DeviceManager::DeviceManager";
    findAvailableDevices();
#ifdef NN_DEBUGGABLE
    mStrictSlicing = (getProp("debug.nn.strict-slicing") != 0);
    mPartitioning = getProp("debug.nn.partition", kPartitioningDefault);
    mDebugNNCpuOnly = (getProp("debug.nn.cpuonly") != 0);
    mSyncExecCpu = (getProp("debug.nn.syncexec-cpu", 1) != 0);
    mSyncExecRuntime = (getProp("debug.nn.syncexec-runtime") != 0);
#endif  // NN_DEBUGGABLE
}

}  // namespace nn
}  // namespace android