/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "BurstBuilder.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "ExecutionCallback.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "TypeManager.h"

namespace android {
namespace nn {

namespace {

// The index of the main model in SourceModels.
constexpr uint32_t kMainModelInSourceModels = 0;

constexpr uint32_t kNoPadding = 1;

// Compiles the model on device.
// If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
// been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
// operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
// device name, device version string, and the execution preference in this function.
int compile(const Device& device, const ModelBuilder& model, int executionPreference,
            int compilationPriority, const OptionalTimePoint& deadline, const CacheInfo& cacheInfo,
            TokenHasher* token, std::shared_ptr<RuntimePreparedModel>* preparedModel) {
    CHECK(token != nullptr);
    CHECK(preparedModel != nullptr);
    *preparedModel = nullptr;

    std::optional<CacheToken> cacheToken;
    if (device.isCachingSupported() && token->ok() &&
        token->updateFromString(device.getName().c_str()) &&
        token->updateFromString(device.getVersionString().c_str()) &&
        token->update(&executionPreference, sizeof(executionPreference)) &&
        token->update(&compilationPriority, sizeof(compilationPriority)) && token->finish()) {
        cacheToken = CacheToken{};
        const uint8_t* tokenPtr = token->getCacheToken();
        std::copy(tokenPtr, tokenPtr + cacheToken->size(), cacheToken->begin());
    }

    const ModelFactory makeModel = [&model] { return model.makeModel(); };
    const ExecutionPreference preference = static_cast<ExecutionPreference>(executionPreference);
    const Priority priority = convertToCanonicalPriority(compilationPriority);
    const auto [n, returnedPreparedModel] =
            device.prepareModel(makeModel, preference, priority, deadline, cacheInfo, cacheToken);
    *preparedModel = returnedPreparedModel;
    return n;
}

typedef std::function<void(uint32_t)> OperationReadyCallback;

int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
                           const Operand& fromOperand) {
    if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
        std::holds_alternative<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams)) {
        auto& fromChannelQuant =
                std::get<Operand::SymmPerChannelQuantParams>(fromOperand.extraParams);
        ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
                .channelDim = fromChannelQuant.channelDim,
                .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
                .scales = fromChannelQuant.scales.data(),
        };
        return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
    } else if (isExtension(fromOperand.type) &&
               std::holds_alternative<Operand::ExtensionParams>(fromOperand.extraParams)) {
        auto extensionData = std::get<Operand::ExtensionParams>(fromOperand.extraParams);
        return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
                                             extensionData.size());
    } else if (!std::holds_alternative<Operand::NoParams>(fromOperand.extraParams) ||
               fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
        LOG(ERROR) << "Type " << fromOperand.type
                   << " has an unexpected extraParams variant: " << fromOperand.extraParams.index();
        return ANEURALNETWORKS_BAD_DATA;
    } else {
        return ANEURALNETWORKS_NO_ERROR;
    }
}

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away and calls cb for each one of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. The outputs of
    // the operation are now known, which may make additional operations ready
    // to run. Calls cb for each one of them.
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
    : mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
                lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
                count++;
                mOperandToOperations.emplace(operandIndex, operationIndex);
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}

StaticTemporaryLocation addTemporary(uint32_t* totalSizeOfTemporaries, uint32_t size,
                                     uint32_t alignment, uint32_t padding) {
    // TODO: what about overflow?
    *totalSizeOfTemporaries = roundUp(*totalSizeOfTemporaries, alignment);
    const uint32_t offset = *totalSizeOfTemporaries;
    size = roundUp(size, padding);
    *totalSizeOfTemporaries += size;
    return {.offset = offset, .paddedLength = size};
};

std::string toString(SourceOperandIndex sourceOperandIndex) {
    return "(" + std::to_string(sourceOperandIndex.first) + ", " +
           std::to_string(sourceOperandIndex.second) + ")";
};

// A helper class to analyze the step roles of all partition boundary operands.
//
// To use, call StepRoleAnalyzer::analyze and pass in a setup function that configures the analyzer
// with the following two methods:
// - addRole: Add a step role to a boundary operand
// - setUsedBy: Specify that the memory of the "source" operand may be directly used by the "dest"
//   operand. All of the step roles of the "dest" operand are also possible step roles of the
//   "source" operand.
//   This is useful for interpreted control flow, e.g., the outer input operand of an
//   interpreted IF operation may be directly used as all step roles of the corresponding
//   input operand of the then and else models. Note that this relationship is directional --
//   (A->B && B->C) implies A->C, but (A->C && B->C) does not imply A->B or B->A (A->B is a
//   shorthand for setUsedBy(A, B)). The setup function must guarantee that the final graph
//   produced by the used-by relationship is acyclic. This is true for the partitioner algorithm
//   because there must be a root operand of each step role for the memory to be allocated on
//   behalf of.
//
class StepRoleAnalyzer {
   public:
    static std::map<SourceOperandIndex, std::set<StepRole>> analyze(
            const std::function<void(StepRoleAnalyzer&)>& setup) {
        StepRoleAnalyzer analyzer;
        setup(analyzer);
        return analyzer.finish();
    }

    void addRole(const ExecutionStep& step, uint32_t operandIndex, IOType type,
                 uint32_t stepIOIndex) {
        SourceOperandIndex source = {step.getSourceModelIndex(), operandIndex};
        mRoles[source].emplace(step.getIndex(), type, stepIOIndex);
    }

    void setUsedBy(const SourceOperandIndex& source, const SourceOperandIndex& dest) {
        mUsedBy[source].emplace(dest);
    }

   private:
    StepRoleAnalyzer() = default;

    // Merges the step roles of the destination operands to the source operands
    // and returns the final map.
    std::map<SourceOperandIndex, std::set<StepRole>> finish() {
        for (const auto& [source, _] : mUsedBy) {
            finishHelper(source);
        }
        return std::move(mRoles);
    }

    void finishHelper(SourceOperandIndex current) {
        if (mProcessedOperands.count(current) > 0) return;
        mProcessedOperands.insert(current);
        const auto it = mUsedBy.find(current);
        if (it != mUsedBy.end()) {
            auto& roles = mRoles[current];
            // Merge the step roles of the destination operands.
            for (const auto& dest : it->second) {
                finishHelper(dest);
                const auto& destRoles = mRoles[dest];
                roles.insert(destRoles.begin(), destRoles.end());
            }
        }
    }

    // A map from the source operand to its step roles.
    std::map<SourceOperandIndex, std::set<StepRole>> mRoles;
    // A map from the source operand to a set of destination operands that may directly
    // use the memory of the source operand.
    std::map<SourceOperandIndex, std::set<SourceOperandIndex>> mUsedBy;
    // Used in finish to track which operand has been processed.
std::set mProcessedOperands; }; } // namespace void DynamicTemporaries::vlogDump(const char* context) const { if (empty()) { return; } if (context) { VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\""; } for (const auto& temp : mSourceOperandToTemporary) { VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first) << ", stepIndex = " << temp.second.stepIndex << ", offset = " << temp.second.offset << ", dimensions = " << toString(temp.second.dimensions) << ", paddedLength = " << temp.second.paddedLength << ", alignment = " << temp.second.alignment << ", padding = " << temp.second.padding; } } void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex, const Dimensions& initialDimensions, uint32_t initialLength, uint32_t alignment, uint32_t padding) { VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = " << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex << ", initialDimensions = " << toString(initialDimensions) << ", initialLength = " << initialLength << ", alignment = " << alignment << ", padding = " << padding << ")"; CHECK(!mDeclared); CHECK_GT(initialLength, 0u); const uint32_t paddedLength = roundUp(initialLength, padding); auto [_, isNew] = mSourceOperandToTemporary.emplace( sourceOperandIndex, InternalLocationAndShape{stepIndex, 0, initialDimensions, paddedLength, alignment, padding}); CHECK(isNew); mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex); } bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex, const Dimensions& newDimensions, uint32_t newLength) { auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) { VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = " << toString(sourceOperandIndex) << ", newDimensions = " << toString(newDimensions) << ", newLength = " << newLength << ") -> " << toString(changedShape); return changedShape; }; CHECK(mDeclared); CHECK_GT(newLength, 0u); InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex); const uint32_t paddedLength = roundUp(newLength, temp.padding); if (temp.paddedLength == paddedLength && temp.dimensions == newDimensions) { return createAndLogResult(false); } if (temp.paddedLength < paddedLength) { // Otherwise allocation remains valid, even if it may be suboptimal // (because it uses more space than needed). Use case: Don't force // client to allocate again just because the client reported more // accurate shape information. mAllocatedStepIndexes.erase(temp.stepIndex); } temp.paddedLength = paddedLength; temp.dimensions = newDimensions; return createAndLogResult(true); } int DynamicTemporaries::allocate(uint32_t stepIndex) { VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")"; CHECK(mDeclared); const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex); if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) { return ANEURALNETWORKS_NO_ERROR; } // perform layout uint32_t newSize = 0; for (const auto& sourceOperandIndex : sourceOperandIndexesI->second) { InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex); // temp.paddedLength is already padded in declare and redeclare. 
CHECK(temp.paddedLength % temp.padding == 0); temp.offset = addTemporary(&newSize, temp.paddedLength, temp.alignment, kNoPadding).offset; } // perform (re-)allocation // TODO: Today we may shrink the allocation in order to avoid wasting memory. Is this important // to conserve memory, or do we waste time reallocating? const double kWaste = 0.2 /* arbitrary */; // Willing to waste space to avoid // deallocation/reallocation overhead auto& memory = mStepIndexToMemory[stepIndex]; const uint32_t oldSize = (memory ? memory->getSize() : 0); if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) { // Suitable allocation already exists; nothing to do } else { int n; std::tie(n, memory) = MemoryAshmem::create(newSize); if (n != ANEURALNETWORKS_NO_ERROR) { LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize << " for step " << stepIndex; mAllocatedStepIndexes.erase(stepIndex); return n; } } mAllocatedStepIndexes.insert(stepIndex); return ANEURALNETWORKS_NO_ERROR; } bool DynamicTemporaries::allocated(uint32_t stepIndex) const { return (mStepIndexToSourceOperandIndexes.find(stepIndex) == mStepIndexToSourceOperandIndexes.end()) || mAllocatedStepIndexes.count(stepIndex); } std::optional DynamicTemporaries::lookup( SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const { CHECK(mDeclared); if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex); it != mSourceOperandToTemporary.end()) { const InternalLocationAndShape& temp = it->second; const bool isAllocated = allocated(temp.stepIndex); if (mustBeAllocated) { CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex) << " must be allocated"; } if (isAllocated) { return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset, &temp.dimensions, temp.paddedLength}; } else { return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.paddedLength}; } } return std::nullopt; } ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex, std::shared_ptr device) : mPlan(plan), mIndex(stepIndex), mSourceModelIndex(sourceModelIndex), mStepModel(), mDevice(device), mToken(plan->getCacheToken()) {} // Adds an operand if it has not been added already. // Sets the index in the step model for the corresponding operand. int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperandIndex, OperandKind kind) { // Have we added this operand already? auto i = mOperandMap.find(sourceOperandIndex); if (i != mOperandMap.end()) { CHECK(kind == INPUT); *stepOperandIndex = i->second; return ANEURALNETWORKS_NO_ERROR; } // First time we add this operand. *stepOperandIndex = mStepModel.operandCount(); mOperandMap.emplace(sourceOperandIndex, *stepOperandIndex); // Add the operand to the step model. const ModelBuilder& sourceModel = *getSourceModel(); const Operand& operand = sourceModel.getOperand(sourceOperandIndex); ANeuralNetworksOperandType type = { .type = static_cast(operand.type), .dimensionCount = static_cast(operand.dimensions.size()), .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr, .scale = operand.scale, .zeroPoint = operand.zeroPoint, }; int n = mStepModel.addOperand(type); if (n != ANEURALNETWORKS_NO_ERROR) { LOG(ERROR) << "Previous error occurred when partitioning the graph"; return n; } n = copyOperandExtraParams(mStepModel, *stepOperandIndex, operand); if (n != ANEURALNETWORKS_NO_ERROR) { LOG(ERROR) << "Error when copying extra parameters to the operand"; return n; } // Sets its value. 
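    // Constant (CONSTANT_COPY/CONSTANT_REFERENCE/POINTER), NO_VALUE, and SUBGRAPH operands have
    // their values set on the step model right away; TEMPORARY_VARIABLE and
    // SUBGRAPH_INPUT/SUBGRAPH_OUTPUT operands are only recorded here so that they can be wired
    // across partition boundaries later.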
switch (operand.lifetime) { case Operand::LifeTime::CONSTANT_COPY: { const uint8_t* data = sourceModel.getPointerToOperandValue(operand.location.offset); n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length); } break; case Operand::LifeTime::CONSTANT_REFERENCE: { const RuntimeMemory* memory = sourceModel.getMemories()[operand.location.poolIndex]; n = mStepModel.setOperandValueFromMemory( *stepOperandIndex, memory, operand.location.offset, operand.location.length); } break; case Operand::LifeTime::NO_VALUE: { n = mStepModel.setOperandValue(*stepOperandIndex, nullptr, 0); } break; case Operand::LifeTime::TEMPORARY_VARIABLE: { // handled similarly to SUBGRAPH_OUTPUT if (kind == INPUT) { // The first time we've seen this operand is as an // input. That means it must be defined by a // different partition, and is an input to this one. mTempsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); } else { // The first time we've seen this operand is as an // output. It may be an input to a different // partition, so keep track of it. mPlan->recordTemporaryDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex), mIndex); } } break; case Operand::LifeTime::SUBGRAPH_INPUT: { mModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); } break; case Operand::LifeTime::SUBGRAPH_OUTPUT: { // handled similarly to TEMPORARY_VARIABLE if (kind == INPUT) { // The first time we've seen this operand is as an // input. That means it must be defined by a // different partition, and is an input to this one. mOutputsAsStepModelInputs.emplace_back(sourceOperandIndex, *stepOperandIndex); } else { // The first time we've seen this operand is as an // output. mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex); // It may be an input to a different partition, so keep track of // it. mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex), mIndex); } } break; case Operand::LifeTime::SUBGRAPH: { const ModelBuilder* model = sourceModel.getReferencedModel(operand); n = mStepModel.setOperandValueFromModel(*stepOperandIndex, model); } break; case Operand::LifeTime::POINTER: { const void* data = std::get(operand.location.pointer); n = mStepModel.setOperandValue(*stepOperandIndex, data, operand.location.length); } break; } if (n != ANEURALNETWORKS_NO_ERROR) { LOG(ERROR) << "Previous error occurred when partitioning the graph"; } return n; } int ExecutionStep::addOperation(int operationIndex) { const Operation& operation = getSourceModel()->getOperation(operationIndex); if (mToken.ok()) { mToken.update(&mSourceModelIndex, sizeof(mSourceModelIndex)); mToken.update(&operationIndex, sizeof(operationIndex)); } // Convert the input and output operand indexes. // // We expect operations to be added in topological order. Therefore: // // - We may not have seen an input if it is a model input, a // constant, or an operand written by a different partition. // // - We should not have seen any outputs. 
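    // addOperands() below maps each source model operand index to a step model operand index,
    // adding the operand to the step model on first use (see addOperand() above).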
    auto addOperands = [this](const std::vector<uint32_t>& sourceModelOperands,
                              std::vector<uint32_t>* stepModelOperands, OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(sourceModelOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            NN_RETURN_IF_ERROR(addOperand(sourceModelOperands[i], &stepModelOperands->at(i), kind));
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);
    NN_RETURN_IF_ERROR(addOperands(operation.inputs, &inputs, INPUT));
    NN_RETURN_IF_ERROR(addOperands(operation.outputs, &outputs, OUTPUT));
    return mStepModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                   outputCount, outputs.data());
}

void ExecutionStep::mapInputsAndOutputs(
        std::shared_ptr<StepExecutor> executor,
        const std::vector<OutputShape>* mainModelOutputShapes, const RuntimeMemory* temporaryMemory,
        const std::map<SourceOperandIndex, StaticTemporaryLocation>&
                sourceOperandToLocationOfTemporary,
        const DynamicTemporaries& dynamicTemporaries,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
        const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantReferenceLocation>&
                sourceOperandToConstantReference) const {
    auto mapInput = [&](uint32_t stepModelOperandIndex, uint32_t stepInputIndex) {
        SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex);
        if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex);
            it != sourceOperandToLocationOfTemporary.end()) {
            const auto& loc = it->second;
            executor->setInputFromMemory(stepInputIndex, temporaryMemory, loc.offset,
                                         loc.paddedLength);
        } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
            executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset,
                                         loc->paddedLength, *loc->dimensions);
        } else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
                   it != sourceOperandToInputIndex.end()) {
            executor->mapInput(it->second, stepInputIndex);
        } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
                   it != sourceOperandToOutputIndex.end()) {
            executor->mapOutputToInput(it->second, stepInputIndex,
                                       mainModelOutputShapes
                                               ? &mainModelOutputShapes->at(it->second).dimensions
                                               : nullptr);
        } else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
                   it != sourceOperandToConstantReference.end()) {
            // Constant partition boundary operand. This could be an IF branch
            // model input or a WHILE variable initializer.
const auto& loc = it->second; executor->setInputFromMemory(stepInputIndex, loc.memory, loc.offset, loc.length); } else { CHECK(false) << "Cannot map step input " << stepInputIndex << " from operand " << toString(sourceOperandIndex); } }; auto mapOutput = [&](uint32_t stepModelOperandIndex, uint32_t stepOutputIndex) { SourceOperandIndex sourceOperandIndex(mSourceModelIndex, stepModelOperandIndex); if (auto it = sourceOperandToLocationOfTemporary.find(sourceOperandIndex); it != sourceOperandToLocationOfTemporary.end()) { const auto& loc = it->second; executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, loc.offset, loc.paddedLength); } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) { executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset, loc->paddedLength, *loc->dimensions); } else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex); it != sourceOperandToOutputIndex.end()) { executor->mapOutput(it->second, stepOutputIndex); } else { CHECK(false) << "Cannot map step output " << stepOutputIndex << " from operand " << toString(sourceOperandIndex); } }; for (uint32_t i = 0, n = mStepModelInputs.size(); i < n; ++i) { mapInput(mStepModelInputs[i].first, i); } for (uint32_t i = 0, n = mStepModelOutputs.size(); i < n; ++i) { mapOutput(mStepModelOutputs[i].first, i); } } void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() { auto declareModelOutputIsDownstreamInput = [this](const SourceOperandIndex& sourceOperandIndex) { const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex); CHECK(it != mOutputToDefiningExecutionStep.end()); uint32_t stepIndex = it->second; CHECK_LT(stepIndex, mSteps.size()); VLOG(COMPILATION) << "ExecutionStep(" << stepIndex << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at" << toString(sourceOperandIndex) << ")"; CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) != mSourceOperandToOutputIndex.end()); mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput( mSourceOperandToOutputIndex.at(sourceOperandIndex)); }; for (const auto& logicalStep : mSteps) { if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { for (const auto& output : step->getOutputsAsStepModelInputs()) { SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first); declareModelOutputIsDownstreamInput(sourceOperandIndex); } } } } void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() { auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) { const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex); if (it == mTemporaryToDefiningExecutionStep.end()) { // The operand is not a temporary or is not defined by an // ExecutionStep (i.e. it's an output of an IF or a WHILE). // The latter case is handled by ExecutionPlan::makeController(). 
return; } uint32_t stepIndex = it->second; CHECK_LT(stepIndex, mSteps.size()); mSteps[stepIndex]->executionStep()->recordTempAsStepModelOutput(sourceOperandIndex.second); }; for (const auto& logicalStep : mSteps) { if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { for (const auto& input : step->getTempsAsStepModelInputs()) { SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), input.first); recordAsOutputIfTemporary(sourceOperandIndex); } } else if (const IfStep* step = logicalStep->tryIfStep()) { recordAsOutputIfTemporary(step->conditionOperandIndex); for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { recordAsOutputIfTemporary(sourceOperandIndex); } } else if (const WhileStep* step = logicalStep->tryWhileStep()) { for (const SourceOperandIndex& sourceOperandIndex : step->outerInputOperands) { recordAsOutputIfTemporary(sourceOperandIndex); } } else { CHECK(logicalStep->isGoto()); } } } void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) { VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput(" << mainModelOutputIndex << ")"; const auto it = std::find(mOutputIndexStepModelToMainModel.begin(), mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex); CHECK(it != mOutputIndexStepModelToMainModel.end()); const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin(); CHECK(stepModelOutputIndex < mModelOutputs.size()); mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex); } void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) { const auto it = mOperandMap.find(stepOperandIndex); CHECK(it != mOperandMap.end()); mTempsAsStepModelOutputs.emplace(stepOperandIndex, it->second); } const ModelBuilder* ExecutionStep::getSourceModel() const { return mPlan->getSourceModels().getModel(mSourceModelIndex); } void ExecutionStep::logStepModel() const { VLOG(COMPILATION) << "ExecutionStep::finishStepModel, step " << mIndex; auto logRemapEntry = [](std::string& toLog, const std::pair& e) { if (!toLog.empty()) { toLog += ", "; } toLog += toString(e.first); toLog += "->"; toLog += toString(e.second); }; auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) { std::string toLog; for (const auto& e : map) { logRemapEntry(toLog, e); } VLOG(COMPILATION) << name << ": " << toLog; }; auto logRemapSet = [&logRemapEntry](const char* name, const StepModelOutputSetType& set) { std::string toLog; for (const auto& e : set) { logRemapEntry(toLog, e); } VLOG(COMPILATION) << name << ": " << toLog; }; logRemapVector("step model inputs", mStepModelInputs); logRemapVector("step model outputs", mStepModelOutputs); logRemapVector("model inputs", mModelInputs); logRemapVector("model outputs", mModelOutputs); logRemapVector("temps as step model inputs", mTempsAsStepModelInputs); logRemapSet("temps as step model outputs", mTempsAsStepModelOutputs); logRemapVector("outputs as step model inputs", mOutputsAsStepModelInputs); } static bool hasUnknownSize(const Operand& operand) { if (operand.dimensions.empty()) { return TypeManager::get()->isTensorType(operand.type); } for (const Dimension& dimension : operand.dimensions) { if (dimension == 0) { return true; } } return false; } int ExecutionStep::finishStepModel(const ModelBuilder* mainModel, bool* hasOutputOfUnknownSize, int32_t executionPreference, int32_t priority) { CHECK(mDevice != nullptr); for (const auto& stepModelOutput : mTempsAsStepModelOutputs) { const 
Operand& operand = mStepModel.getOperand(stepModelOutput.second); if (hasUnknownSize(operand)) { *hasOutputOfUnknownSize = true; VLOG(COMPILATION) << "StepModelOutput (operand#" << stepModelOutput.first << " of source graph) has unknown size: " << operand; } } mStepModel.relaxComputationFloat32toFloat16(mainModel->isComputationFloat32RelaxedToFloat16()); mStepModelInputs.insert(mStepModelInputs.end(), mModelInputs.begin(), mModelInputs.end()); mStepModelInputs.insert(mStepModelInputs.end(), mTempsAsStepModelInputs.begin(), mTempsAsStepModelInputs.end()); mStepModelInputs.insert(mStepModelInputs.end(), mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end()); mStepModelOutputs.insert(mStepModelOutputs.end(), mModelOutputs.begin(), mModelOutputs.end()); mStepModelOutputs.insert(mStepModelOutputs.end(), mTempsAsStepModelOutputs.begin(), mTempsAsStepModelOutputs.end()); // A step model with no inputs or no outputs is an invalid model. Note that we would like to // attempt full CPU fallback if allowed, so we return OP_FAILED here rather than BAD_DATA from // model validation. if (hasNoInputsOrNoOutputs()) { VLOG(COMPILATION) << "ExecutionStep::finishStepModel: finishing step model with no inputs " "or no outputs"; return ANEURALNETWORKS_OP_FAILED; } if (mSourceModelIndex == kMainModelInSourceModels) { std::map mainModelOperandToInputIndex; for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { mainModelOperandToInputIndex[mainModel->getInputOperandIndex(i)] = i; } std::map mainModelOperandToOutputIndex; for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { mainModelOperandToOutputIndex[mainModel->getOutputOperandIndex(i)] = i; } // mInputIndexStepModelToMainModel is ordered by step model input index and relies on // mModelInputs being the first inputs, as specified by mStepModelInputs. mInputIndexStepModelToMainModel.resize(mModelInputs.size()); std::transform(mModelInputs.begin(), mModelInputs.end(), mInputIndexStepModelToMainModel.begin(), [&mainModelOperandToInputIndex](auto& e) { uint32_t sourceOperandIndex = e.first; return mainModelOperandToInputIndex[sourceOperandIndex]; }); // mOutputIndexStepModelToMainModel is ordered by step model output index and relies on // mModelOutputs being the first outputs, as specified by mStepModelOutputs. mOutputIndexStepModelToMainModel.resize(mModelOutputs.size()); std::transform(mModelOutputs.begin(), mModelOutputs.end(), mOutputIndexStepModelToMainModel.begin(), [&mainModelOperandToOutputIndex](auto& e) { uint32_t sourceOperandIndex = e.first; return mainModelOperandToOutputIndex[sourceOperandIndex]; }); // mOutputsAsStepModelInputsIndexToMainModel is ordered by step model input index and relies // on mOutputsAsStepModelInputs being the first outputs. 
mOutputsAsStepModelInputsIndexToMainModel.resize(mOutputsAsStepModelInputs.size()); std::transform(mOutputsAsStepModelInputs.begin(), mOutputsAsStepModelInputs.end(), mOutputsAsStepModelInputsIndexToMainModel.begin(), [&mainModelOperandToOutputIndex](auto& e) { uint32_t sourceOperandIndex = e.first; return mainModelOperandToOutputIndex[sourceOperandIndex]; }); } if (VLOG_IS_ON(COMPILATION)) { logStepModel(); } std::vector inputs(mStepModelInputs.size()); std::vector outputs(mStepModelOutputs.size()); std::transform(mStepModelInputs.begin(), mStepModelInputs.end(), inputs.begin(), [](auto& e) { return e.second; }); std::transform(mStepModelOutputs.begin(), mStepModelOutputs.end(), outputs.begin(), [](auto& e) { return e.second; }); NN_RETURN_IF_ERROR(mStepModel.identifyInputsAndOutputs(inputs.size(), inputs.data(), outputs.size(), outputs.data())); NN_RETURN_IF_ERROR(mStepModel.finish()); // TODO: Move compilation elsewhere? VLOG(COMPILATION) << "ExecutionStep::finishStepModel, compilation on " << mDevice->getName(); return compile(*mDevice, mStepModel, executionPreference, priority, {}, *mPlan->getCacheInfo(), &mToken, &mPreparedStepModel); } void ExecutionStep::dump() const { if (VLOG_IS_ON(COMPILATION)) { VLOG(COMPILATION) << "Step#" << mIndex << ": execute on " << mDevice->getName(); logModelToInfo(mStepModel.makeModel()); } } std::ostream& operator<<(std::ostream& os, const IfStep& step) { return os << "Step#" << step.index << ": if " << toString(step.conditionOperandIndex) << " then=" << step.thenStepIndex << " else=" << step.elseStepIndex; } std::ostream& operator<<(std::ostream& os, const WhileStep& step) { return os << "Step#" << step.index << ": while cond=" << step.condStepIndex << " body=" << step.bodyStepIndex << " exit=" << step.exitStepIndex; } std::ostream& operator<<(std::ostream& os, const GotoStep& step) { return os << "Step#" << step.index << ": goto " << step.gotoStepIndex; } void LogicalStep::dump() const { if (VLOG_IS_ON(COMPILATION)) { if (const IfStep* step = tryIfStep()) { VLOG(COMPILATION) << *step; } else if (const WhileStep* step = tryWhileStep()) { VLOG(COMPILATION) << *step; } else if (const GotoStep* step = tryGotoStep()) { VLOG(COMPILATION) << *step; } else { executionStep()->dump(); } } } int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels, int32_t executionPreference, int32_t priority, const OptionalTimePoint& deadline, int simulateFailureResultCode) { CHECK(!mSuccessfulFinish); CHECK(!deadline.has_value()); const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels); auto containsUnknownSize = [sourceModels](const std::vector& operands) { for (const auto& sourceOperandIndex : operands) { const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); if (hasUnknownSize(operand)) { return true; } } return false; }; findTempsAsStepModelOutputs(); for (const auto& logicalStep : mSteps) { if (ExecutionStep* step = logicalStep->tryExecutionStep()) { bool stepHasDynamicTemporaries = false; int n = step->finishStepModel(mainModel, &stepHasDynamicTemporaries, executionPreference, priority); if (stepHasDynamicTemporaries) { mHasDynamicTemporaries = true; if (step->getDevice()->getFeatureLevel() < kHalVersionV1_2ToApi.featureLevel) { // Until HAL 1.2, an Operand with lifetime SUBGRAPH_OUTPUT // must have fully specified dimensions either in the // Operand or in the RequestArgument. 
In the case of a // dynamic temporary, we won't be able to supply fully // specified dimensions in either. VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- step#" << step->getIndex() << " defines dynamic temporaries but is scheduled on pre-1.2 device " << step->getDevice()->getName(); if (n == ANEURALNETWORKS_NO_ERROR) { n = ANEURALNETWORKS_OP_FAILED; } } } if (n != ANEURALNETWORKS_NO_ERROR) { VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishStepModel failed"; return n; } } else if (IfStep* step = logicalStep->tryIfStep()) { // The partitioner does not support dynamic temporaries (b/132458982). CHECK(!containsUnknownSize(step->outerInputOperands)); CHECK(!containsUnknownSize(step->outerOutputOperands)); // step->conditionOperandIndex has a static shape. See b/158557728. CHECK(!containsUnknownSize(step->thenBranchInputOperands)); CHECK(!containsUnknownSize(step->thenBranchOutputOperands)); CHECK(!containsUnknownSize(step->elseBranchInputOperands)); CHECK(!containsUnknownSize(step->elseBranchOutputOperands)); } else if (WhileStep* step = logicalStep->tryWhileStep()) { // The partitioner does not support dynamic temporaries (b/132458982). CHECK(!containsUnknownSize(step->outerInputOperands)); CHECK(!containsUnknownSize(step->outerOutputOperands)); CHECK(!containsUnknownSize(step->condInputOperands)); // step->condOutputOperand has a static shape. See b/158557728. CHECK(!containsUnknownSize(step->bodyInputOperands)); CHECK(!containsUnknownSize(step->bodyOutputOperands)); } else { CHECK(logicalStep->isGoto()); } } if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) { VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode " << simulateFailureResultCode; return simulateFailureResultCode; } for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) { SourceOperandIndex index(kMainModelInSourceModels, mainModel->getInputOperandIndex(i)); mSourceOperandToInputIndex[index] = i; } for (uint32_t i = 0, n = mainModel->outputCount(); i < n; ++i) { SourceOperandIndex index(kMainModelInSourceModels, mainModel->getOutputOperandIndex(i)); mSourceOperandToOutputIndex[index] = i; } findControlFlowBoundaryConstants(sourceModels); findModelOutputsThatAreDownstreamInputs(); findMemoryStepRoles(); mSuccessfulFinish = true; return ANEURALNETWORKS_NO_ERROR; } void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants( const SourceModels* sourceModels) { auto handleBoundaryConstants = [this, sourceModels](const SourceOperandIndex& sourceOperandIndex) { const ModelBuilder* sourceModel = sourceModels->getModel(sourceOperandIndex.first); const Operand& operand = sourceModel->getOperand(sourceOperandIndex.second); const DataLocation& location = operand.location; if (operand.lifetime == Operand::LifeTime::CONSTANT_COPY) { mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = { .buffer = sourceModel->getPointerToOperandValue(location.offset), .length = location.length, }; } else if (operand.lifetime == Operand::LifeTime::POINTER) { mSourceOperandToBoundaryConstantCopy[sourceOperandIndex] = { .buffer = static_cast(std::get(location.pointer)), .length = location.length, }; } else if (operand.lifetime == Operand::LifeTime::CONSTANT_REFERENCE) { mSourceOperandToBoundaryConstantReference[sourceOperandIndex] = { .memory = sourceModel->getMemories()[location.poolIndex], .offset = location.offset, .length = location.length, }; } }; for (const auto& logicalStep : mSteps) { if (const IfStep* step = logicalStep->tryIfStep()) { 
handleBoundaryConstants(step->conditionOperandIndex); for (const auto& sourceOperandIndex : step->outerInputOperands) { handleBoundaryConstants(sourceOperandIndex); } } else if (const WhileStep* step = logicalStep->tryWhileStep()) { for (const auto& sourceOperandIndex : step->outerInputOperands) { handleBoundaryConstants(sourceOperandIndex); } } } } void ExecutionPlan::CompoundBody::findMemoryStepRoles() { mSourceOperandToStepRoles = StepRoleAnalyzer::analyze([this](StepRoleAnalyzer& analyzer) { for (const auto& logicalStep : mSteps) { if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { const auto& stepModelInputs = step->getStepModelInputs(); for (uint32_t i = 0; i < stepModelInputs.size(); i++) { const auto& [sourceIndex, stepIndex] = stepModelInputs[i]; analyzer.addRole(*step, sourceIndex, IOType::INPUT, i); } const auto& stepModelOutputs = step->getStepModelOutputs(); for (uint32_t i = 0; i < stepModelOutputs.size(); i++) { const auto& [sourceIndex, stepIndex] = stepModelOutputs[i]; analyzer.addRole(*step, sourceIndex, IOType::OUTPUT, i); } } else if (const IfStep* step = logicalStep->tryIfStep()) { // See ExecutionPlan::nextCompound(const IfStep*, ...). // // For interpreted IF operation, the outer input memories may be directly used by // the SUBGRAPH_INPUTs of the then and else model. CHECK_EQ(step->thenBranchInputOperands.size(), step->outerInputOperands.size()); CHECK_EQ(step->elseBranchInputOperands.size(), step->outerInputOperands.size()); for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) { analyzer.setUsedBy(step->outerInputOperands[i], step->thenBranchInputOperands[i]); analyzer.setUsedBy(step->outerInputOperands[i], step->elseBranchInputOperands[i]); } // For interpreted IF operation, the outer output memories may be directly used by // the SUBGRAPH_OUTPUTs of the then and else model. CHECK_EQ(step->thenBranchOutputOperands.size(), step->outerOutputOperands.size()); CHECK_EQ(step->elseBranchOutputOperands.size(), step->outerOutputOperands.size()); for (uint32_t i = 0; i < step->outerOutputOperands.size(); i++) { analyzer.setUsedBy(step->outerOutputOperands[i], step->thenBranchOutputOperands[i]); analyzer.setUsedBy(step->outerOutputOperands[i], step->elseBranchOutputOperands[i]); } } else if (const WhileStep* step = logicalStep->tryWhileStep()) { // See ExecutionPlan::nextCompound(const WhileStep*, ...). // // For interpreted WHILE operation, the following memories are involved: // a. the outer input memories to the WHILE operation // b. the outer output memories to the WHILE operation // c. the output memory of the condition model // d. one set of output memories of the body model // e. another set of output memories of the body model // // The memories are used in the following ways: // // - Condition model: // * In the first iteration: inputs use (a); output uses (c) // * In the following iterations: inputs use (d) or (e) for input-output and // state-only operands, and (a) for input-only operands; output uses (c) // // - Body model: // * In all iterations: inputs are the same as the condition model; outputs use // (d) or (e) // // Therefore, we configure the analyzer with the following used-by relationships: // - The outer input memories (a) may be directly used by the SUBGRAPH_INPUTs of // the condition model for all inputs in the first iteration, as well as the // input-only operands in the following iterations. 
                CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
                for (uint32_t i = 0; i < step->outerInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->outerInputOperands[i], step->condInputOperands[i]);
                }
                // - The output memories of the body model (d) and (e) may be directly used by the
                //   SUBGRAPH_INPUTs of the condition model for input-output and state-only
                //   operands after the first iteration.
                CHECK_GE(step->condInputOperands.size(), step->bodyOutputOperands.size());
                for (uint32_t i = 0; i < step->bodyOutputOperands.size(); i++) {
                    analyzer.setUsedBy(step->bodyOutputOperands[i], step->condInputOperands[i]);
                }
                // - The SUBGRAPH_INPUTs of the condition model are directly used by the
                //   SUBGRAPH_INPUTs of the body model for all inputs in all iterations.
                CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size());
                for (uint32_t i = 0; i < step->bodyInputOperands.size(); i++) {
                    analyzer.setUsedBy(step->condInputOperands[i], step->bodyInputOperands[i]);
                }
            } else if (logicalStep->isGoto()) {
                // Nothing to do.
            } else {
                CHECK(false) << "Unexpected LogicalStep kind";
            }
        }
    });
}

int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
                                      int32_t priority, const OptionalTimePoint& deadline,
                                      int simulateFailureResultCode) {
    CHECK(!mSuccessfulFinish);
    CHECK(mDevice != nullptr);
    VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
    int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheInfo,
                    &mToken, &mPreparedModel);
    if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
        VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
                          << simulateFailureResultCode;
        n = simulateFailureResultCode;
    }
    mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
    return n;
}

int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
                          const OptionalTimePoint& deadline, int simulateFailureResultCode) {
    CHECK(mBody != nullptr);
    return mBody->finish(&getSourceModels(), executionPreference, priority, deadline,
                         simulateFailureResultCode);
}

ExecutionPlan::Controller::Controller(
        const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
        const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary,
        std::map<SourceOperandIndex, StaticTemporaryLocation> sourceOperandToLocationOfTemporary2,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
        std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
        const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
        std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
        DynamicTemporaries dynamicTemporaries)
    : mPlan(plan),
      mExecutionBuilder(executionBuilder),
      mBurstBuilder(burstBuilder),
      mSourceOperandToLocationOfTemporary(std::move(sourceOperandToLocationOfTemporary)),
      mSourceOperandToLocationOfTemporary2(std::move(sourceOperandToLocationOfTemporary2)),
      mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
      mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
      mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
      mDynamicTemporaries(std::move(dynamicTemporaries)),
      mNextStepIndex(0),
      mFallbackNextStepIndex(kBadStepIndex),
      mLastStepSyncFd(-1) {
    if (totalSizeOfTemporaries == 0) {
        return;
    }
    int n;
    std::tie(n, mTemporaries) = MemoryAshmem::create(totalSizeOfTemporaries);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries";
        mNextStepIndex = kBadStepIndex;
    }
    for (const auto& [sourceOperandIndex, location] : sourceOperandToConstantCopy) {
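        // Copy each partition-boundary constant into the slot reserved for it in the shared
        // temporary memory, so control flow steps can read it like any other temporary.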
memcpy(mTemporaries->getPointer() + mSourceOperandToLocationOfTemporary[sourceOperandIndex].offset, location.buffer, location.length); } } // Attempt to create a burst object for each PreparedModel/Partition. If the // burst controller object cannot be made, return a nullptr in its place to // indicate the regular execution path should be used. This can occur either // because PreparedModel was nullptr (cpu was best choice), or because the // IPreparedModel was of insufficient version or failed to configure the burst. std::vector ExecutionPlan::makeBursts() const { switch (mState) { // burst object for each partition in the compound case case COMPOUND: { std::vector bursts; bursts.reserve(compound()->mSteps.size()); for (const auto& logicalStep : compound()->mSteps) { if (!logicalStep->isExecution()) { bursts.push_back(nullptr); continue; } if (const auto preparedModel = logicalStep->executionStep()->getPreparedStepModel()) { const auto maybeBurst = preparedModel->configureExecutionBurst(); if (!maybeBurst.has_value()) { LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with " << maybeBurst.error().code << ": " << maybeBurst.error().message; } bursts.push_back(maybeBurst.value_or(nullptr)); } else { bursts.push_back(nullptr); } } return bursts; } // single burst object for the simple case case SIMPLE: { std::vector burst; auto simpleBody = simple(); if (const auto preparedModel = simpleBody->mPreparedModel) { const auto maybeBurst = preparedModel->configureExecutionBurst(); if (!maybeBurst.has_value()) { LOG(ERROR) << "preparedModel->configureExecutionBurst() failed with " << maybeBurst.error().code << ": " << maybeBurst.error().message; } burst.push_back(maybeBurst.value_or(nullptr)); } else { burst.push_back(nullptr); } return burst; } // no burst objects made default: return {}; } } std::shared_ptr ExecutionPlan::makeController( ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const { CHECK(isValid()); CHECK(mState != SIMPLE); const auto* body = compound(); // Create the layout for a RuntimeMemory object big enough to hold // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and // - buffers required by the control flow implementation. // // TODO: Rethink this approach for managing temporaries. Some // alternatives: // // 1) Adopt a memory layout scheme analogous to stack allocation, // where objects of non-overlapping lifetime can occupy the same // storage. We would still have a single Memory object in this // case. // // 2) Do something like what CpuExecutor does, and do allocations // and deallocations on the fly (during execution) before first // reference and after last reference, respectively. This would // mean having one Memory object per TEMPORARY; or, in a more // complicated implementation, one Memory object per set of // temporaries that have the same lifetime. Note that the Android // system limits the number of shared memory objects, which are // what our Memory objects represent. // uint32_t totalSizeOfTemporaries = 0; // This function has two modes of operation: // 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for // TEMPORARY_VARIABLE source operands that are not dynamic temporaries, // skip TEMPORARY_VARIABLE source operands that are dynamic temporaries, // skip SUBGRAPH_OUTPUT source operands, and panic if we see a source // operand of another lifetime. // 2. 
When lifetime is SUBGRAPH_OUTPUT, we allocate memory for // SUBGRAPH_OUTPUT source operands and panic if we see a source operand // of another lifetime. auto mapTemporary = [body, executionBuilder, &totalSizeOfTemporaries]( const SourceOperandIndex& sourceOperandIndex, std::map* sourceOperandToLocationOfTemporary, Operand::LifeTime lifetime = Operand::LifeTime::TEMPORARY_VARIABLE) { CHECK(lifetime == Operand::LifeTime::TEMPORARY_VARIABLE || lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT); const Operand& sourceOperand = executionBuilder->getSourceOperand(sourceOperandIndex); if (lifetime == Operand::LifeTime::TEMPORARY_VARIABLE && sourceOperand.lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) { // See the caller for explanation. return; } CHECK_EQ(sourceOperand.lifetime, lifetime); const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand); if (size != 0u) { const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex); const auto loc = addTemporary(&totalSizeOfTemporaries, size, memoryPreference.alignment, memoryPreference.padding); auto [_, isNew] = sourceOperandToLocationOfTemporary->emplace(sourceOperandIndex, loc); CHECK(isNew); VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex) << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength; } else { // Unknown size, hence dynamic temporary. The mapping will // be established elsewhere (DynamicTemporaries::allocate()). CHECK_EQ(lifetime, Operand::LifeTime::TEMPORARY_VARIABLE); CHECK_EQ(sourceOperand.lifetime, Operand::LifeTime::TEMPORARY_VARIABLE); } }; std::map sourceOperandToLocationOfTemporary; std::map sourceOperandToLocationOfTemporary2; for (const auto& logicalStep : body->mSteps) { if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { // Allocate memory for ExecutionStep temporary outputs that are // inputs to other steps, as determined by // ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs(). // // We don't allocate memory for step model output operands with // source operand lifetime SUBGRAPH_OUTPUT because they will be // - managed by the client (main model outputs), // - assigned a location of another operand (when this step model // output is a branch model output of an IF; see // ExecutionPlan::nextCompound(const IfStep*, ...)), or // - allocated by a WHILE (when this step model output // is a condition or body model output of a WHILE; see the // step->bodyOutputOperands and step->condOutputOperand handling // below). for (const auto& output : step->getTempsAsStepModelOutputs()) { mapTemporary(SourceOperandIndex(step->getSourceModelIndex(), output.first), &sourceOperandToLocationOfTemporary); } } else if (const IfStep* step = logicalStep->tryIfStep()) { // Allocate memory for all temporary outputs of an IfStep because // they are going to be written to by a branch model. We don't // perform unused output operand optimisation for referenced models. // // We don't allocate memory for branch output operands because they // use the same location as the corresponding outer output operands, // as established in ExecutionPlan::nextCompound(const IfStep*, ...) 
// // We don't allocate memory for outer output operands with source // operand lifetime SUBGRAPH_OUTPUT because they will be // - managed by the client (main model outputs), // - assigned a location of another operand (when this IF outer // output is a branch model output of another IF; see // ExecutionPlan::nextCompound(const IfStep*, ...)), or // - allocated by a WHILE (when this IF outer output // is a condition or body model output of a WHILE; see the // step->bodyOutputOperands and step->condOutputOperand handling // below). for (const auto& sourceOperandIndex : step->outerOutputOperands) { mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary); } } else if (const WhileStep* step = logicalStep->tryWhileStep()) { // Allocate memory for all temporary outputs of an WhileStep because // they are going to be written to by the WHILE loop. // // We don't allocate memory for outer output operands with source // operand lifetime SUBGRAPH_OUTPUT because they will be // - managed by the client (main model outputs), // - assigned a location of another operand (when this WHILE outer // output is a branch model output of an IF; see // ExecutionPlan::nextCompound(const IfStep*, ...)), or // - allocated by another WHILE (when this WHILE outer output // is a condition or body model output of another WHILE; see the // step->bodyOutputOperands and step->condOutputOperand handling // below). for (const auto& sourceOperandIndex : step->outerOutputOperands) { mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary); } // Allocate memory for body model outputs. Note that we could use // the outer output operand memory instead but we currently don't do // so (b/148206073). for (const auto& sourceOperandIndex : step->bodyOutputOperands) { mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary, Operand::LifeTime::SUBGRAPH_OUTPUT); // Allocate another set of temporaries for double buffering. mapTemporary(sourceOperandIndex, &sourceOperandToLocationOfTemporary2, Operand::LifeTime::SUBGRAPH_OUTPUT); } // Allocate memory for condition model output. // TODO: Share one condition output memory region between all loops. mapTemporary(step->condOutputOperand, &sourceOperandToLocationOfTemporary, Operand::LifeTime::SUBGRAPH_OUTPUT); } else { CHECK(logicalStep->isGoto()); } } // Allocate temporary memory for boundary CONSTANT_COPY operands. for (const auto& [sourceOperandIndex, location] : body->mSourceOperandToBoundaryConstantCopy) { const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex); const auto loc = addTemporary(&totalSizeOfTemporaries, location.length, memoryPreference.alignment, memoryPreference.padding); sourceOperandToLocationOfTemporary.emplace(sourceOperandIndex, loc); VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex) << " offset = " << loc.offset << " paddedLength = " << loc.paddedLength; } // Collect dynamic temporaries. // TODO(b/157236079): Move some or all of this work to compilation time? DynamicTemporaries dynamicTemporaries; const TypeManager* typeManager = TypeManager::get(); forEachDynamicTemporary([body, typeManager, &dynamicTemporaries]( SourceOperandIndex sourceOperandIndex, const Operand& sourceOperand, uint32_t definingStepIndex) { CHECK(typeManager->isTensorType(sourceOperand.type)); const auto memoryPreference = body->getMemoryPreferenceOfSourceOperand(sourceOperandIndex); // TODO: For now we guess an initial size equal to element // size, which is overly conservative. 
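        // getSizeOfData(type, {1}) is the size of a single element; the size is revised later
        // through DynamicTemporaries::redeclare() once the actual shape is known.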
        const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
        dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
                                   size, memoryPreference.alignment, memoryPreference.padding);
    });
    dynamicTemporaries.endDeclarations();
    dynamicTemporaries.vlogDump("finished declarations");

    return std::shared_ptr<Controller>(new Controller(
            this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
            std::move(sourceOperandToLocationOfTemporary),
            std::move(sourceOperandToLocationOfTemporary2), body->mSourceOperandToInputIndex,
            body->mSourceOperandToOutputIndex, body->mSourceOperandToBoundaryConstantCopy,
            body->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
}

// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
                            const std::vector<OutputShape>* mainModelOutputShapes) const {
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;

    if (controller->mFallbackNextStepIndex == Controller::kBadStepIndex) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    controller->mNextStepIndex = controller->mFallbackNextStepIndex;
    return next(controller, executor, burstController, mainModelOutputShapes);
}

ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
    : mInfo(RunTimePoolInfo::createFromExistingBuffer(static_cast<uint8_t*>(pointer), size)),
      mOffset(0) {}

ExecutionPlan::Buffer::Buffer(RunTimePoolInfo info, uint32_t offset)
    : mInfo(std::move(info)), mOffset(offset) {}

void* ExecutionPlan::Buffer::getPointer() const {
    return mInfo.getBuffer() + mOffset;
}

uint32_t ExecutionPlan::Buffer::getSize() const {
    return mInfo.getSize() - mOffset;
}

void ExecutionPlan::Buffer::flush() const {
    mInfo.flush();
}

std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBufferFromModelArgumentInfo(
        const ModelArgumentInfo& info, const ExecutionBuilder* executionBuilder) const {
    switch (info.state()) {
        case ModelArgumentInfo::POINTER: {
            return Buffer(info.buffer(), info.length());
        } break;
        case ModelArgumentInfo::MEMORY: {
            if (std::optional<RunTimePoolInfo> poolInfo =
                        executionBuilder->getRunTimePoolInfo(info.locationAndLength().poolIndex)) {
                return Buffer(*poolInfo, info.locationAndLength().offset);
            } else {
                LOG(ERROR) << "Unable to map operand memory pool";
                return std::nullopt;
            }
        } break;
        case ModelArgumentInfo::HAS_NO_VALUE: {
            LOG(ERROR) << "Attempting to read an operand that has no value";
            return std::nullopt;
        } break;
        default: {
            LOG(ERROR) << "Unexpected operand memory state: " << static_cast<int>(info.state());
            return std::nullopt;
        } break;
    }
}

std::optional<ExecutionPlan::Buffer> ExecutionPlan::getBuffer(
        std::shared_ptr<Controller> controller, SourceOperandIndex operandIndex) const {
    const auto& sourceOperandToLocationOfTemporary =
            controller->mSourceOperandToLocationOfTemporary;
    const auto& sourceOperandToInputIndex = controller->mSourceOperandToInputIndex;
    const auto& sourceOperandToOutputIndex = controller->mSourceOperandToOutputIndex;
    const auto& sourceOperandToConstantReference = controller->mSourceOperandToConstantReference;
    if (auto it = sourceOperandToLocationOfTemporary.find(operandIndex);
        it != sourceOperandToLocationOfTemporary.end()) {
        const uint32_t offset = it->second.offset;
it->second.offset;
        const std::unique_ptr<MemoryAshmem>& memory = controller->mTemporaries;
        return Buffer(memory->getPointer() + offset, memory->getSize() - offset);
    } else if (auto it = sourceOperandToInputIndex.find(operandIndex);
               it != sourceOperandToInputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getInputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToOutputIndex.find(operandIndex);
               it != sourceOperandToOutputIndex.end()) {
        const ModelArgumentInfo& info = controller->mExecutionBuilder->getOutputInfo(it->second);
        return getBufferFromModelArgumentInfo(info, controller->mExecutionBuilder);
    } else if (auto it = sourceOperandToConstantReference.find(operandIndex);
               it != sourceOperandToConstantReference.end()) {
        const ConstantReferenceLocation& location = it->second;
        const std::optional<RunTimePoolInfo> info = location.memory->getRunTimePoolInfo();
        if (info == std::nullopt) {
            return std::nullopt;
        }
        return Buffer(info->getBuffer() + location.offset, location.length);
    }
    return std::nullopt;
}

int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
                                      SourceOperandIndex operandIndex, bool* value) const {
    std::optional<Buffer> buffer = getBuffer(controller, operandIndex);
    if (buffer == std::nullopt) {
        LOG(ERROR) << "Unable to read operand " << toString(operandIndex);
        return ANEURALNETWORKS_OP_FAILED;
    }
    CHECK_GE(buffer->getSize(), sizeof(bool8));
    bool8 value8 = *static_cast<bool8*>(buffer->getPointer());
    *value = static_cast<bool>(value8);
    VLOG(EXECUTION) << "readConditionValue: " << *value;
    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
                        const std::vector<OutputShape>* mainModelOutputShapes,
                        int syncFdOfLastStep) const {
    CHECK(mState == COMPOUND);

    controller->mLastStepSyncFd = syncFdOfLastStep;
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}

int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    auto compoundBody = compound();
    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        controller->mNextStepIndex = Controller::kBadStepIndex;  // end
        return ANEURALNETWORKS_NO_ERROR;
    }

    const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
    if (const IfStep* step = logicalStep->tryIfStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else if (const WhileStep* step = logicalStep->tryWhileStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else if (const GotoStep* step = logicalStep->tryGotoStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
        return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
    } else {
        CHECK(false) << "Unknown step variant";
        return ANEURALNETWORKS_BAD_STATE;
    }
}

int
ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor, SharedBurst* burstController,
                            const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
                    << step->getDevice()->getName();

    NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
    controller->mDynamicTemporaries.vlogDump("finished allocating for a step");

    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
                                               step->getDevice(), step->getPreparedStepModel(),
                                               /*reusable=*/false, step,
                                               &controller->mDynamicTemporaries);

    step->mapInputsAndOutputs(
            *executor, mainModelOutputShapes, controller->mTemporaries.get(),
            controller->mSourceOperandToLocationOfTemporary, controller->mDynamicTemporaries,
            controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
            controller->mSourceOperandToConstantReference);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }

    controller->mFallbackNextStepIndex = controller->mNextStepIndex;
    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}

// The first argument is the "source" operand, the second operand is the "destination".
void ExecutionPlan::Controller::setInput(const SourceOperandIndex& outerOperand,
                                         const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping input " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToInputIndex.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand) +
                     mSourceOperandToConstantReference.count(innerOperand),
             1u);
#endif
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToInputIndex.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    mSourceOperandToConstantReference.erase(innerOperand);
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToInputIndex.find(outerOperand);
               it != mSourceOperandToInputIndex.end()) {
        mSourceOperandToInputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToConstantReference.find(outerOperand);
               it != mSourceOperandToConstantReference.end()) {
        mSourceOperandToConstantReference.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model input operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}

// The first argument is the "source" operand, the second operand is the "destination".
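// Unlike setInput() above, setOutput() below only consults the temporary-location and
// main-model-output maps: a location coming from a main model input or a constant would not be
// writable by a step model output, so those maps are neither checked nor updated here.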
void ExecutionPlan::Controller::setOutput(const SourceOperandIndex& outerOperand,
                                          const SourceOperandIndex& innerOperand) {
    VLOG(EXECUTION) << "mapping output " << toString(innerOperand) << " from "
                    << toString(outerOperand);
#ifdef NN_DEBUGGABLE
    CHECK_LE(mSourceOperandToLocationOfTemporary.count(innerOperand) +
                     mSourceOperandToOutputIndex.count(innerOperand),
             1u);
#endif
    mSourceOperandToLocationOfTemporary.erase(innerOperand);
    mSourceOperandToOutputIndex.erase(innerOperand);
    if (auto it = mSourceOperandToLocationOfTemporary.find(outerOperand);
        it != mSourceOperandToLocationOfTemporary.end()) {
        mSourceOperandToLocationOfTemporary.emplace(innerOperand, it->second);
    } else if (auto it = mSourceOperandToOutputIndex.find(outerOperand);
               it != mSourceOperandToOutputIndex.end()) {
        mSourceOperandToOutputIndex.emplace(innerOperand, it->second);
    } else {
        CHECK(false) << "Cannot set step model output operand " << toString(innerOperand)
                     << " from operand " << toString(outerOperand);
    }
}

int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
    if (mLastStepSyncFd == -1) {
        return ANEURALNETWORKS_NO_ERROR;
    }
    VLOG(EXECUTION) << "wait for mLastStepSyncFd " << mLastStepSyncFd;
    auto r = syncWait(mLastStepSyncFd, -1);
    int n = ANEURALNETWORKS_NO_ERROR;
    if (r != FenceState::SIGNALED) {
        LOG(ERROR) << "syncWait failed, fd: " << mLastStepSyncFd;
        n = ANEURALNETWORKS_OP_FAILED;
    }
    return n;
}

// Invocations of Controller::setInput/setOutput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the IfStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    VLOG(EXECUTION) << "next: " << *step;

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());

    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->conditionOperandIndex, &condValue));
    controller->mNextStepIndex = condValue ? step->thenStepIndex : step->elseStepIndex;
    const std::vector<SourceOperandIndex>& branchInputOperands =
            condValue ? step->thenBranchInputOperands : step->elseBranchInputOperands;
    const std::vector<SourceOperandIndex>& branchOutputOperands =
            condValue ? step->thenBranchOutputOperands : step->elseBranchOutputOperands;
    CHECK_EQ(branchInputOperands.size(), step->outerInputOperands.size());
    CHECK_EQ(branchOutputOperands.size(), step->outerOutputOperands.size());
    for (uint32_t i = 0, n = step->outerInputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate cases when the IF resides within a WHILE condition or
        // body model and for some j the i-th input of the IF branch model is
        // - an input of the WHILE condition model (whileStep->condInputOperands[j]),
        // - an input of the WHILE body model (whileStep->bodyInputOperands[j]), or
        // - an output of the WHILE body model (whileStep->bodyOutputOperands[j]).
        // In such cases, the WhileStep modifies the location of
        // step->outerInputOperands[i] to implement double buffering.
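        // Illustrative example (hypothetical indices): if this IF sits in a WHILE body model and
        // outerInputOperands[i] is whileStep->bodyOutputOperands[j], then after the WHILE's first
        // iteration the operand's entry in mSourceOperandToLocationOfTemporary points at the
        // "other" buffer of the double-buffer pair, and the setInput() call below is what
        // propagates that refreshed location to the branch model's input i.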
        controller->setInput(step->outerInputOperands[i], branchInputOperands[i]);
    }
    for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) {
        // We have to do this assignment just before executing this step to
        // accommodate the case when the IF resides within a WHILE body
        // model and the i-th output of the IF branch model is an
        // output of the WHILE body model (whileStep->bodyOutputOperands[j] for
        // some j). In that case, the WhileStep modifies the location of
        // step->outerOutputOperands[i] to implement double buffering.
        controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
    }
    return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}

// Invocations of Controller::setInput in this function must match with invocations of
// StepRoleAnalyzer::setUsedBy in the WhileStep branch in
// ExecutionPlan::CompoundBody::findMemoryStepRoles.
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
                                std::shared_ptr<StepExecutor>* executor,
                                SharedBurst* burstController,
                                const std::vector<OutputShape>* mainModelOutputShapes) const {
    WhileState& state = controller->mWhileState[controller->mNextStepIndex];
    if (state.stage == WhileState::EVALUATE_CONDITION) {
        state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating condition";
        controller->mNextStepIndex = step->condStepIndex;

        if (state.iteration == 0) {
            state.startTime = Clock::now();
        }

        // iteration = 0   cond inputs = outer inputs
        // iteration = 1   cond inputs = body outputs
        // iteration = 2   cond inputs = body outputs
        // iteration = 3   cond inputs = ...
        uint32_t loopBodyOutputCount = step->bodyOutputOperands.size();
        CHECK_EQ(step->condInputOperands.size(), step->outerInputOperands.size());
        CHECK_GE(step->condInputOperands.size(), loopBodyOutputCount);
        for (uint32_t i = 0, n = step->condInputOperands.size(); i < n; ++i) {
            bool operandIsInputOnly = i >= loopBodyOutputCount;
            controller->setInput((state.iteration == 0 || operandIsInputOnly)
                                         ? step->outerInputOperands[i]
                                         : step->bodyOutputOperands[i],
                                 step->condInputOperands[i]);
        }
        state.stage = WhileState::EVALUATE_BODY;
        return nextCompound(controller, executor, burstController, mainModelOutputShapes);
    }

    CHECK(state.stage == WhileState::EVALUATE_BODY);
    std::chrono::nanoseconds timeoutDuration(
            controller->mExecutionBuilder->getLoopTimeoutDuration());
    auto duration = Clock::now() - state.startTime;
    if (duration > timeoutDuration) {
        LOG(ERROR) << "WHILE loop timed out after "
                   << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
                   << " ms";
        return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
    }

    // If the last step has a sync fence, wait for it to signal before reading the condition value.
    // This is safe because the steps are serialized when doing fenced compute.
    NN_RETURN_IF_ERROR(controller->waitForLastStepSyncFence());

    bool condValue;
    NN_RETURN_IF_ERROR(readConditionValue(controller, step->condOutputOperand, &condValue));
    if (condValue) {
        VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration
                        << ": evaluating body";
        controller->mNextStepIndex = step->bodyStepIndex;

        // iteration = 0   body inputs = cond inputs = outer inputs   body outputs = tmp1
        // iteration = 1   body inputs = cond inputs = tmp1           body outputs = tmp2
        // iteration = 2   body inputs = cond inputs = tmp2           body outputs = tmp1
        // iteration = 3   body inputs = cond inputs = ...            body outputs = ...
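        // In other words (see the std::swap below): on every iteration after the first, the
        // entries for each body output operand in mSourceOperandToLocationOfTemporary and
        // mSourceOperandToLocationOfTemporary2 are exchanged, so the body writes tmp1 and tmp2
        // alternately and never reads the buffer it is about to overwrite.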
#ifdef NN_DEBUGGABLE CHECK_GE(step->bodyInputOperands.size(), step->bodyOutputOperands.size()); CHECK_EQ(step->bodyInputOperands.size(), step->outerInputOperands.size()); CHECK_EQ(step->bodyInputOperands.size(), step->condInputOperands.size()); CHECK_GE(step->bodyOutputOperands.size(), step->outerOutputOperands.size()); #endif for (uint32_t i = 0, n = step->bodyInputOperands.size(); i < n; ++i) { controller->setInput(step->condInputOperands[i], step->bodyInputOperands[i]); } if (state.iteration != 0) { for (const SourceOperandIndex& outputOperand : step->bodyOutputOperands) { #ifdef NN_DEBUGGABLE CHECK_EQ(controller->mSourceOperandToInputIndex.count(outputOperand), 0u); CHECK_EQ(controller->mSourceOperandToOutputIndex.count(outputOperand), 0u); CHECK_EQ(controller->mSourceOperandToLocationOfTemporary.count(outputOperand), 1u); CHECK_EQ(controller->mSourceOperandToLocationOfTemporary2.count(outputOperand), 1u); #endif std::swap(controller->mSourceOperandToLocationOfTemporary[outputOperand], controller->mSourceOperandToLocationOfTemporary2[outputOperand]); } } } else { VLOG(EXECUTION) << "next: " << *step << ": iteration " << state.iteration << ": exiting loop"; controller->mNextStepIndex = step->exitStepIndex; // Copy body outputs to outer outputs. // TODO: Use outer outputs instead of tmp2 to avoid copying? CHECK_LE(step->outerOutputOperands.size(), step->bodyOutputOperands.size()); for (uint32_t i = 0, n = step->outerOutputOperands.size(); i < n; ++i) { // condInputOperands[i] points to a body output operand from the // last iteration if we've executed at least one iteration and to a // WHILE operation input operand otherwise. const SourceOperandIndex& innerOperand = step->condInputOperands[i]; const SourceOperandIndex& outerOperand = step->outerOutputOperands[i]; std::optional outerBuffer = getBuffer(controller, outerOperand); if (outerBuffer == std::nullopt) { // This should never happen. LOG(ERROR) << "Unable to get outerBuffer for operand " << toString(outerOperand); return ANEURALNETWORKS_OP_FAILED; } const Operand& sourceOperand = controller->mExecutionBuilder->getSourceOperand(outerOperand); const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand); CHECK_NE(size, 0u); std::optional innerBuffer = getBuffer(controller, innerOperand); if (innerBuffer == std::nullopt) { // This should never happen. 
LOG(ERROR) << "Unable to get innerBuffer for operand " << toString(innerOperand); return ANEURALNETWORKS_OP_FAILED; } CHECK_LE(size, innerBuffer->getSize()); CHECK_LE(size, outerBuffer->getSize()); memcpy(outerBuffer->getPointer(), innerBuffer->getPointer(), size); outerBuffer->flush(); } state.iteration = WhileState::kOutsideLoop; } state.stage = WhileState::EVALUATE_CONDITION; return nextCompound(controller, executor, burstController, mainModelOutputShapes); } int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr controller, std::shared_ptr* executor, SharedBurst* burstController, const std::vector* mainModelOutputShapes) const { VLOG(EXECUTION) << "next: " << *step; controller->mNextStepIndex = step->gotoStepIndex; return nextCompound(controller, executor, burstController, mainModelOutputShapes); } std::shared_ptr ExecutionPlan::makeStepExecutor( bool reusable, ExecutionBuilder* executionBuilder) const { auto simpleBody = simple(); auto executor = std::make_shared(executionBuilder, simpleBody->mModel, simpleBody->mDevice, simpleBody->mPreparedModel, reusable); executor->mapInputsAndOutputsTrivially(); return executor; } void ExecutionPlan::becomeCompoundIfEmpty() { CHECK(mState != SIMPLE); if (mState == EMPTY) { mBody = new CompoundBody(this); mState = COMPOUND; } } ExecutionStep* ExecutionPlan::createNewExecutionStep(uint32_t sourceModelIndex, const std::shared_ptr device) { becomeCompoundIfEmpty(); auto step = std::make_shared(std::in_place_type, this, compound()->mSteps.size(), sourceModelIndex, device); compound()->mSteps.push_back(step); return step->executionStep(); } IfStep* ExecutionPlan::createNewIfStep() { becomeCompoundIfEmpty(); auto step = std::make_shared(std::in_place_type); step->ifStep()->index = compound()->mSteps.size(); compound()->mSteps.push_back(step); return step->ifStep(); } WhileStep* ExecutionPlan::createNewWhileStep() { becomeCompoundIfEmpty(); auto step = std::make_shared(std::in_place_type); step->whileStep()->index = compound()->mSteps.size(); compound()->mSteps.push_back(step); return step->whileStep(); } GotoStep* ExecutionPlan::createNewGotoStep() { becomeCompoundIfEmpty(); auto step = std::make_shared(std::in_place_type); step->gotoStep()->index = compound()->mSteps.size(); compound()->mSteps.push_back(step); return step->gotoStep(); } void ExecutionPlan::becomeSingleStep(const std::shared_ptr device, const ModelBuilder* model) { CHECK(mState == EMPTY); mBody = new SimpleBody(device, model, mCacheInfo, mToken); mState = SIMPLE; } void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) { auto [it, isNew] = compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex); CHECK(isNew) << "Step " << stepIndex << " redefines output operand " << toString(sourceOperandIndex) << " already defined by step " << it->second; } void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) { auto [it, isNew] = compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex); CHECK(isNew) << "Step " << stepIndex << " redefines temporary operand " << toString(sourceOperandIndex) << " already defined by step " << it->second; } void ExecutionPlan::dump() const { if (mBody) { mBody->dump(); } else { VLOG(COMPILATION) << "EMPTY"; } } void ExecutionPlan::reset() { if (mBody) { delete mBody; mBody = nullptr; } mState = EMPTY; } bool ExecutionPlan::isSimpleCpu() const { return isSimple() && simple()->mDevice == DeviceManager::getCpuDevice(); } 
ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            nnAssert(!"unexpected state");
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    return simple()->mDevice;
}

const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compoundGetSteps() const {
    return compound()->mSteps;
}

std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
    CHECK_EQ(getSourceModels().size(), size_t(1));
    std::set<uint32_t> ret;
    forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
        ret.insert(dynTemp.second);
    });
    return ret;
}

bool ExecutionPlan::hasDynamicTemporaries() const {
    return mBody->hasDynamicTemporaries();
}

bool ExecutionPlan::forTest_hasStepModelWithNoInputsOrNoOutputs() const {
    return mBody->hasStepModelWithNoInputsOrNoOutputs();
}

bool ExecutionPlan::CompoundBody::hasStepModelWithNoInputsOrNoOutputs() const {
    return std::any_of(mSteps.begin(), mSteps.end(), [](const auto& logicalStep) {
        const ExecutionStep* step = logicalStep->tryExecutionStep();
        return step != nullptr && step->hasNoInputsOrNoOutputs();
    });
}

const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
    return simple()->mToken.getCacheToken();
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

SourceOperandIndex ExecutionPlan::getInputSourceOperand(uint32_t index) const {
    const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
    CHECK_LT(index, mainModel->inputCount());
    const auto operandIndex = mainModel->getInputOperandIndex(index);
    return {kMainModelInSourceModels, operandIndex};
}

SourceOperandIndex ExecutionPlan::getOutputSourceOperand(uint32_t index) const {
    const auto* mainModel = getSourceModels().getModel(kMainModelInSourceModels);
    CHECK_LT(index, mainModel->outputCount());
    const auto operandIndex = mainModel->getOutputOperandIndex(index);
    return {kMainModelInSourceModels, operandIndex};
}

void ExecutionPlan::SimpleBody::forEachStepRoleOfInput(uint32_t index,
                                                       const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::INPUT, index);
}

void ExecutionPlan::SimpleBody::forEachStepRoleOfOutput(uint32_t index,
                                                        const StepRoleCallback& callback) const {
    callback(mPreparedModel.get(), IOType::OUTPUT, index);
}

// Map an input role of the main model to the input/output roles in the step models.
void ExecutionPlan::CompoundBody::forEachStepRoleOfInput(uint32_t index,
                                                         const StepRoleCallback& callback) const {
    const auto sourceOperandIndex = mPlan->getInputSourceOperand(index);
    forEachStepRoleOfSourceOperand(sourceOperandIndex, callback);
}

// Map an output role of the main model to the input/output roles in the step models.
void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index, const StepRoleCallback& callback) const { const auto sourceOperandIndex = mPlan->getOutputSourceOperand(index); forEachStepRoleOfSourceOperand(sourceOperandIndex, callback); } void ExecutionPlan::CompoundBody::forEachStepRoleOfSourceOperand( const SourceOperandIndex& index, const StepRoleCallback& callback) const { const auto it = mSourceOperandToStepRoles.find(index); if (it == mSourceOperandToStepRoles.end()) return; for (const auto& [stepIndex, type, ioIndex] : it->second) { CHECK_LT(stepIndex, mSteps.size()); const auto* step = mSteps[stepIndex]->executionStep(); callback(step->getPreparedStepModel().get(), type, ioIndex); } } MemoryPreference ExecutionPlan::getMemoryPreference(IOType type, uint32_t index) const { CHECK(mState == SIMPLE || mState == COMPOUND); if (mState == SIMPLE) { return simple()->mPreparedModel->getMemoryPreference(); } else { const auto sourceOperandIndex = type == IOType::INPUT ? getInputSourceOperand(index) : getOutputSourceOperand(index); return compound()->getMemoryPreferenceOfSourceOperand(sourceOperandIndex); } } MemoryPreference ExecutionPlan::CompoundBody::getMemoryPreferenceOfSourceOperand( const SourceOperandIndex& index) const { uint32_t alignment = kMinMemoryAlignment, padding = kMinMemoryPadding; forEachStepRoleOfSourceOperand( index, [&alignment, &padding](const auto* preparedModel, IOType, uint32_t) { const auto preference = preparedModel->getMemoryPreference(); alignment = std::max(alignment, preference.alignment); padding = std::max(padding, preference.padding); }); return {alignment, padding}; } void ExecutionPlan::forEachDynamicTemporary( const std::function& fn) const { if (mState != COMPOUND) { return; } for (const auto& logicalStep : compound()->mSteps) { if (const ExecutionStep* step = logicalStep->tryExecutionStep()) { const uint32_t stepIndex = step->getIndex(); const uint32_t sourceModelIndex = step->getSourceModelIndex(); for (const auto& entry : step->getTempsAsStepModelOutputs()) { const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first); const auto& sourceOperand = getSourceOperand(sourceOperandIndex); if (hasUnknownSize(sourceOperand)) { fn(sourceOperandIndex, sourceOperand, stepIndex); } } } } } int ModelBuilder::partitionTheWork(const std::vector>& devices, uint32_t preference, uint32_t priority, const OptionalTimePoint& deadline, ExecutionPlan* plan, int simulateFailureResultCode) const { uint32_t sourceModelIndex = plan->getSourceModels().addModel(this); NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority, deadline, plan)); int n = plan->finish(preference, priority, deadline, simulateFailureResultCode); if (VLOG_IS_ON(COMPILATION)) { VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: "; logModelToInfo(makeModel()); plan->dump(); } return n; } int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex, const std::vector>& devices, uint32_t preference, uint32_t priority, const OptionalTimePoint& deadline, ExecutionPlan* plan) const { // This function uses a heuristic approach to partitioning the graph. // It should be good enough for the first release. 
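    // In outline, the heuristic is:
    //   1. findBestDeviceForEachOperation() picks the best device index for every operation,
    //      using deviceCount as a sentinel meaning "interpret this control flow operation".
    //   2. If a single real device wins every operation of the main model, emit a single-step
    //      plan and stop.
    //   3. Otherwise, greedily drain per-device ready queues: each drain of a real device's
    //      queue becomes one ExecutionStep, and IF/WHILE operations are expanded into
    //      IfStep/WhileStep/GotoStep sequences with their referenced models partitioned
    //      recursively.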
SourceModels* sourceModels = &plan->getSourceModels(); const size_t deviceCount = devices.size(); const size_t operationCount = mOperations.size(); VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: " << "sourceModelIndex = " << sourceModelIndex << ", " << "deviceCount = " << deviceCount << ", " << "operationCount = " << operationCount; // Figure out where each operation will best execute. // The value of the vector is the index in the devices vector. std::vector bestDeviceForOperation(operationCount); NN_RETURN_IF_ERROR( findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation)); // A special value produced by findBestDeviceForEachOperation meaning that // this is a control flow operation scheduled for interpreted execution // (see LogicalStep). const int kControlFlowInterpreter = deviceCount; // If one device will run all the operations, we don't need to split the // work. This shortcut does not apply when recursively partitioning // referenced models because our plan representation is flat. if (sourceModelIndex == kMainModelInSourceModels && std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(), std::not_equal_to()) == bestDeviceForOperation.end()) { const int bestDeviceIndex = bestDeviceForOperation[0]; // Bypass the partitioning process unless the only operation is a // control flow operation scheduled for interpreted execution. if (bestDeviceIndex != kControlFlowInterpreter) { VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: " << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName(); plan->becomeSingleStep(devices[bestDeviceIndex], this); return ANEURALNETWORKS_NO_ERROR; } } // No easy solution, we need to split the work. // We keep track of the operations that are ready to run for each device. // perDeviceQueue[deviceCount] is for interpreted execution of control flow // (see LogicalStep). std::vector> perDeviceQueue(deviceCount + 1); // This helper function produces a device name. auto deviceName = [&devices, kControlFlowInterpreter, deviceCount](int deviceIndex) -> std::string { if (deviceIndex == kControlFlowInterpreter) { return "NNAPI"; } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) { return "{unknown}"; } else { return devices.at(deviceIndex)->getName(); } }; // This helper function enqueues the operation on the appropriate queue. auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) { int deviceIndex = bestDeviceForOperation[operationIndex]; perDeviceQueue[deviceIndex].push(operationIndex); VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto " << deviceIndex << " (" << deviceName(deviceIndex) << ")"; }; // This helper function finds a device that has operations ready to process. // We start by looking at the control flow queue, and then look at the // devices in reverse order (i.e., starting at the end of the devices // vector). Earlier devices have a chance to prepare more of the inputs // required by other devices. This function returns -1 if all queues are // empty. auto findNextDeviceToProcess = [&]() -> int { for (int i = perDeviceQueue.size() - 1; i >= 0; i--) { if (!perDeviceQueue[i].empty()) { return i; } } return -1; }; OperandTracker tracker(this, enqueueOnAppropriateDevice); // For each iteration of this loop, we'll create either an execution step or // an interpreted control flow construct (including nested execution steps // and interpreted control flow constructs). while (true) { // Find the device we'll do this step for. 
int deviceIndex = findNextDeviceToProcess(); VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " (" << deviceName(deviceIndex) << ")"; if (deviceIndex < 0) { break; } // Assign as much as possible to this device. auto& queue = perDeviceQueue[deviceIndex]; if (deviceIndex != kControlFlowInterpreter) { ExecutionStep* step = plan->createNewExecutionStep(sourceModelIndex, devices[deviceIndex]); while (!queue.empty()) { uint32_t operationIndex = queue.front(); queue.pop(); int n = step->addOperation(operationIndex); if (n != ANEURALNETWORKS_NO_ERROR) { LOG(ERROR) << "failed to add operation " << operationIndex << " to step"; return n; } tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); } } else { while (!queue.empty()) { uint32_t operationIndex = queue.front(); queue.pop(); const Operation& operation = getOperation(operationIndex); if (operation.type == OperationType::IF) { namespace op = operation_if; const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]); const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]); const ModelBuilder* thenModel = getReferencedModel(thenOperand); const ModelBuilder* elseModel = getReferencedModel(elseOperand); uint32_t thenModelIndex = sourceModels->addModel(thenModel); uint32_t elseModelIndex = sourceModels->addModel(elseModel); // Emits the following: // Index Step // i if then=(i + 1) else=(j + 1) // ... (then model steps) // j goto k // ... (else model steps) // k (steps after the IF) IfStep* ifStep = plan->createNewIfStep(); ifStep->conditionOperandIndex = SourceOperandIndex( sourceModelIndex, operation.inputs[op::kCondBoolOperand]); ifStep->thenStepIndex = plan->getNextStepIndex(); NN_RETURN_IF_ERROR(thenModel->partitionTheWorkInternal( thenModelIndex, devices, preference, priority, deadline, plan)); GotoStep* afterThenBranch = plan->createNewGotoStep(); ifStep->elseStepIndex = plan->getNextStepIndex(); NN_RETURN_IF_ERROR(elseModel->partitionTheWorkInternal( elseModelIndex, devices, preference, priority, deadline, plan)); afterThenBranch->gotoStepIndex = plan->getNextStepIndex(); // Outer model operands. for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { ifStep->outerInputOperands.emplace_back(sourceModelIndex, operation.inputs[i]); } for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { ifStep->outerOutputOperands.emplace_back(sourceModelIndex, operation.outputs[i]); } // Then model operands. for (uint32_t i = 0, n = thenModel->inputCount(); i < n; ++i) { ifStep->thenBranchInputOperands.emplace_back( thenModelIndex, thenModel->getInputOperandIndex(i)); } for (uint32_t i = 0, n = thenModel->outputCount(); i < n; ++i) { ifStep->thenBranchOutputOperands.emplace_back( thenModelIndex, thenModel->getOutputOperandIndex(i)); } // Else model operands. 
for (uint32_t i = 0, n = elseModel->inputCount(); i < n; ++i) { ifStep->elseBranchInputOperands.emplace_back( elseModelIndex, elseModel->getInputOperandIndex(i)); } for (uint32_t i = 0, n = elseModel->outputCount(); i < n; ++i) { ifStep->elseBranchOutputOperands.emplace_back( elseModelIndex, elseModel->getOutputOperandIndex(i)); } } else if (operation.type == OperationType::WHILE) { namespace op = operation_while; const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]); const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]); const ModelBuilder* condModel = getReferencedModel(condOperand); const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); uint32_t condModelIndex = sourceModels->addModel(condModel); uint32_t bodyModelIndex = sourceModels->addModel(bodyModel); // Emits the following: // Index Step // i while cond=(i + 1) body=(j + 1) exit=(k + 1) // ... (cond model steps) // j goto i // ... (body model steps) // k goto i // ... (steps after the WHILE) // // Note that WhileStep has WhileState associated with it. WhileStep* whileStep = plan->createNewWhileStep(); whileStep->condStepIndex = plan->getNextStepIndex(); NN_RETURN_IF_ERROR(condModel->partitionTheWorkInternal( condModelIndex, devices, preference, priority, deadline, plan)); GotoStep* afterCond = plan->createNewGotoStep(); afterCond->gotoStepIndex = whileStep->index; whileStep->bodyStepIndex = plan->getNextStepIndex(); NN_RETURN_IF_ERROR(bodyModel->partitionTheWorkInternal( bodyModelIndex, devices, preference, priority, deadline, plan)); GotoStep* afterBody = plan->createNewGotoStep(); afterBody->gotoStepIndex = whileStep->index; whileStep->exitStepIndex = plan->getNextStepIndex(); // Outer model operands. for (uint32_t i = op::kFirstInput, n = operation.inputs.size(); i < n; ++i) { whileStep->outerInputOperands.emplace_back(sourceModelIndex, operation.inputs[i]); } for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) { whileStep->outerOutputOperands.emplace_back(sourceModelIndex, operation.outputs[i]); } // Cond model operands. for (uint32_t i = 0, n = condModel->inputCount(); i < n; ++i) { whileStep->condInputOperands.emplace_back( condModelIndex, condModel->getInputOperandIndex(i)); } whileStep->condOutputOperand = SourceOperandIndex(condModelIndex, condModel->getOutputOperandIndex(0)); // Body model operands. for (uint32_t i = 0, n = bodyModel->inputCount(); i < n; ++i) { whileStep->bodyInputOperands.emplace_back( bodyModelIndex, bodyModel->getInputOperandIndex(i)); } for (uint32_t i = 0, n = bodyModel->outputCount(); i < n; ++i) { whileStep->bodyOutputOperands.emplace_back( bodyModelIndex, bodyModel->getOutputOperandIndex(i)); } } else { CHECK(false) << operation.type << " is not a control flow operation"; } tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice); } } } return ANEURALNETWORKS_NO_ERROR; } float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr device) const { // Note that we will call this method multiple times per compilation with // the same arguments if there are nested control flow operations and we // decide to execute the outer operation on the ExecutionPlan::next() // interpreter. // // This is a potential compilation performance problem. To work around it, // the performance value could be cached for the duration of a compilation. 
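    // A possible (not implemented) mitigation: memoize the result keyed by (preference, device)
    // for the lifetime of the compilation, since the referenced models do not change while the
    // work is being partitioned.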
float perf = 0; const size_t operationCount = mOperations.size(); for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) { perf += getPerformance(preference, device, operationIndex); } return perf; } float ModelBuilder::getPerformance(uint32_t preference, const std::shared_ptr device, uint32_t operationIndex) const { auto applyPreference = [preference](const Capabilities::PerformanceInfo& perf) { return preference == ANEURALNETWORKS_PREFER_LOW_POWER ? perf.powerUsage : perf.execTime; }; const Operation& operation = getOperation(operationIndex); if (operation.type == OperationType::IF) { namespace op = operation_if; const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]); const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]); const ModelBuilder* thenModel = getReferencedModel(thenOperand); const ModelBuilder* elseModel = getReferencedModel(elseOperand); return applyPreference(device->getIfPerformance()) + 0.5 * (thenModel->getPerformance(preference, device) + elseModel->getPerformance(preference, device)); } if (operation.type == OperationType::WHILE) { namespace op = operation_while; const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]); const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]); const ModelBuilder* condModel = getReferencedModel(condOperand); const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); return applyPreference(device->getWhilePerformance()) + condModel->getPerformance(preference, device) + bodyModel->getPerformance(preference, device); } // TODO This assumes that the type is dictated by the first operand. This is // currently the case but is not a safe assumption to make in the long term. 
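    // For example, for a CONV_2D operation inputs[0] is the input tensor, so a TENSOR_FLOAT32
    // versus TENSOR_QUANT8_ASYMM input selects the corresponding per-type performance entry
    // (or the relaxed FP16 entry below when applicable).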
const uint32_t operandIndex = operation.inputs[0]; const OperandType operandType = mOperands[operandIndex].type; switch (operandType) { case OperandType::FLOAT32: if (mRelaxComputationFloat32toFloat16) { return applyPreference(device->getRelaxedFloat32toFloat16PerformanceScalar()); } break; case OperandType::TENSOR_FLOAT32: if (mRelaxComputationFloat32toFloat16) { return applyPreference(device->getRelaxedFloat32toFloat16PerformanceTensor()); } break; default: break; } return applyPreference(device->getPerformance(operandType)); } bool ModelBuilder::isControlFlowOperationWithOperandOfUnknownSize(uint32_t operationIndex) const { auto containsUnknownSize = [](const ModelBuilder* model, const std::vector& operandIndexes) { for (uint32_t operandIndex : operandIndexes) { if (hasUnknownSize(model->getOperand(operandIndex))) { return true; } } return false; }; const Operation& operation = getOperation(operationIndex); if (operation.type == OperationType::IF) { namespace op = operation_if; const Operand& thenOperand = getOperand(operation.inputs[op::kThenModelOperand]); const Operand& elseOperand = getOperand(operation.inputs[op::kElseModelOperand]); const ModelBuilder* thenModel = getReferencedModel(thenOperand); const ModelBuilder* elseModel = getReferencedModel(elseOperand); return containsUnknownSize(this, operation.inputs) || containsUnknownSize(this, operation.outputs) || containsUnknownSize(thenModel, thenModel->getInputOperandIndexes()) || containsUnknownSize(thenModel, thenModel->getOutputOperandIndexes()) || containsUnknownSize(elseModel, elseModel->getInputOperandIndexes()) || containsUnknownSize(elseModel, elseModel->getOutputOperandIndexes()); } if (operation.type == OperationType::WHILE) { namespace op = operation_while; const Operand& condOperand = getOperand(operation.inputs[op::kCondModelOperand]); const Operand& bodyOperand = getOperand(operation.inputs[op::kBodyModelOperand]); const ModelBuilder* condModel = getReferencedModel(condOperand); const ModelBuilder* bodyModel = getReferencedModel(bodyOperand); return containsUnknownSize(this, operation.inputs) || containsUnknownSize(this, operation.outputs) || containsUnknownSize(condModel, condModel->getInputOperandIndexes()) || containsUnknownSize(condModel, condModel->getOutputOperandIndexes()) || containsUnknownSize(bodyModel, bodyModel->getInputOperandIndexes()) || containsUnknownSize(bodyModel, bodyModel->getOutputOperandIndexes()); } // Not a control flow operation. return false; } bool ModelBuilder::supportedByControlFlowInterpreter(uint32_t operationIndex) const { const Operation& operation = getOperation(operationIndex); return (operation.type == OperationType::IF || operation.type == OperationType::WHILE) && // The partitioner does not support dynamic temporaries (b/132458982). 
           !isControlFlowOperationWithOperandOfUnknownSize(operationIndex);
}

namespace {

// This class determines whether a given device can execute a given operation.
class CanDo {
   public:
    CanDo() {}

    void initialize(const MetaModel& metaModel, std::shared_ptr<Device> device) {
        mSupportsOperationByIndex = device->getSupportedOperations(metaModel);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

   private:
    std::vector<bool> mSupportsOperationByIndex;
};

}  // anonymous namespace

int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    const MetaModel metaModel(makeModel(), DeviceManager::get()->strictSlicing());

    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(metaModel, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        const Operation& operation = getOperation(operationIndex);
        // Find which device, including CPU fallback, gives the best performance for this
        // operation.
        int bestChoice = -1;

        if (isControlFlowOperationWithOperandOfUnknownSize(operationIndex)) {
            // Do not schedule control flow operations with unknown size to
            // non-CPU devices because this is not supported by the 1.3 HAL.
            // See http://b/159076604#comment5.
            auto cpuDeviceIterator =
                    std::find(devices.begin(), devices.end(), DeviceManager::getCpuDevice());
            if (cpuDeviceIterator != devices.end()) {
                int cpuDeviceIndex = cpuDeviceIterator - devices.begin();
                if (canDo[cpuDeviceIndex].check(operationIndex)) {
                    bestChoice = cpuDeviceIndex;
                }
            }
        } else {
            float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
            bool bestIsUpdatable = false;
            for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
                const auto& device = devices[deviceIndex];
                if (canDo[deviceIndex].check(operationIndex)) {
                    const float perfVal = getPerformance(preference, device, operationIndex);
                    const bool isUpdatable = device->isUpdatable();
                    const bool deviceIsPreferred = (device == DeviceManager::getCpuDevice() ||
                                                    (isUpdatable && !bestIsUpdatable));
                    if (bestChoice < 0 || perfVal < bestPerfVal ||
                        (perfVal == bestPerfVal && deviceIsPreferred)) {
                        bestChoice = deviceIndex;
                        bestPerfVal = perfVal;
                        bestIsUpdatable = isUpdatable;
                    }
                } else {
                    // Somewhat noisy logging, but only place where the user of NNAPI can get
                    // feedback on why an operation was not run on a specific device.
                    //
                    // Logs O(operationCount * deviceCount) times, but typically deviceCount is
                    // very small.
                    VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                      << operation.type << ":" << operationIndex;
                }
            }
        }

        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do operation " << operation.type;
            return ANEURALNETWORKS_BAD_DATA;
        } else if (devices[bestChoice] == DeviceManager::getCpuDevice() &&
                   supportedByControlFlowInterpreter(operationIndex)) {
            // Run control flow on the ExecutionPlan::next() interpreter and try
            // to delegate referenced models.
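            // kControlFlowInterpreter mirrors the sentinel chosen in partitionTheWorkInternal():
            // an index one past the last real device, which partitionTheWorkInternal() routes to
            // the extra queue it reserves for interpreted control flow.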
const int kControlFlowInterpreter = deviceCount; (*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter; VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type << ":" << operationIndex << ") = -1 (NNAPI)"; } else { (*bestDeviceForOperation)[operationIndex] = bestChoice; VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation(" << operation.type << ":" << operationIndex << ") = " << bestChoice << " (" << devices[bestChoice]->getName() << ")"; } } return ANEURALNETWORKS_NO_ERROR; } } // namespace nn } // namespace android