/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <android-base/thread_annotations.h>
#include <nnapi/IBurst.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"

namespace android {
namespace nn {

class BurstBuilder;
class CompilationBuilder;
class Device;
class DynamicTemporaries;
class ExecutionPlan;
class ExecutionStep;
class ModelBuilder;
class RuntimeMemory;
class RuntimePreparedModel;
class RuntimeExecution;
class StepExecutor;

class ExecutionBuilder {
    friend class StepExecutor;

   public:
    explicit ExecutionBuilder(const CompilationBuilder* compilation);
    virtual ~ExecutionBuilder() = default;

    int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                 size_t length);
    int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                           const RuntimeMemory* memory, size_t offset, size_t length);
    int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                  size_t length);
    int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                            const RuntimeMemory* memory, size_t offset, size_t length);

    int setMeasureTiming(bool measure);

    int getDuration(int32_t durationCode, uint64_t* duration) const;
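
    // Illustrative sketch (added for clarity, not part of the original header): a caller that
    // wants driver-reported timing might combine these entry points roughly as follows. The
    // duration codes are assumed to be the ANEURALNETWORKS_DURATION_* values from
    // NeuralNetworks.h; the execution pointer is a placeholder.
    //
    //     ExecutionBuilder* execution = ...;   // obtained from a CompilationBuilder
    //     execution->setMeasureTiming(true);   // must be set while still in PREPARATION
    //     execution->computeSynchronously();
    //     uint64_t onHardwareNs = 0;
    //     execution->getDuration(ANEURALNETWORKS_DURATION_ON_HARDWARE, &onHardwareNs);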

    int setTimeoutDuration(uint64_t duration);

    std::optional<uint64_t> getTimeoutDuration() const;

    int setLoopTimeout(uint64_t duration);

    uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }

    int enableInputAndOutputPadding(bool enable);

    int setReusable(bool reusable);

    int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
                      int* sync_fence);
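
    // Illustrative sketch (added for clarity): fenced execution waits on the given sync fence
    // file descriptors and, on success, reports a new sync fence through *sync_fence. The fd
    // value and timeout below are placeholders.
    //
    //     std::vector<int> waitFor = {inputReadyFenceFd};   // hypothetical fd owned by the caller
    //     int syncFence = -1;
    //     int result = execution->computeFenced(waitFor,
    //                                           /*timeoutDurationAfterFence=*/0, &syncFence);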

    int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
        CHECK(synchronizationCallback != nullptr);
        return compute(synchronizationCallback);
    }
    int computeSynchronously() { return compute(nullptr); }
    int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }
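
    // Illustrative sketch (added for clarity): the three public entry points above all funnel
    // into compute(). A synchronous caller might look like the following; the buffers, lengths,
    // and indexes are placeholders.
    //
    //     execution->setInput(0, /*type=*/nullptr, inputBuffer, inputLength);
    //     execution->setOutput(0, /*type=*/nullptr, outputBuffer, outputLength);
    //     int result = execution->computeSynchronously();
    //
    // An asynchronous caller passes a callback pointer instead and waits on the returned
    // ExecutionCallback; a burst caller passes the BurstBuilder obtained from its compilation.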

    // Initialize output dimensional information from ModelArgumentInfo.
    std::vector<OutputShape> getInitialOutputShapes() const;

    int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
    int getOutputOperandRank(uint32_t index, uint32_t* rank);
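
    // Illustrative sketch (added for clarity): after a computation completes, a caller can
    // recover the actual output shape by querying the rank first and then the dimensions.
    // The output index 0 is a placeholder.
    //
    //     uint32_t rank = 0;
    //     execution->getOutputOperandRank(0, &rank);
    //     std::vector<uint32_t> dims(rank);
    //     execution->getOutputOperandDimensions(0, dims.data());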

    // Handshake with lower-level execution support
    bool measureTiming() const { return mMeasureTiming; }
    void reportTimingWithoutFencedExecutionCallback(Timing timing) {
        mTimingWithoutFencedExecutionCallback = timing;
    }

    const CompilationBuilder* getCompilation() const { return mCompilation; }
    const ModelBuilder* getModel() const { return mModel; }
    const ModelBuilder* getSourceModel(uint32_t index) const;
    const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
        return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
    }

    // This method will be called at the end of all computation paths to change the state
    // of the execution object and update output shapes / memories.
    int finishComputation(int result, const std::vector<OutputShape>& outputShapes);
    ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
        const int result = finishComputation(convertErrorStatusToResultCode(error), outputShapes);
        return convertResultCodeToErrorStatus(result);
    }

    const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() {
        return mFencedExecutionCallback;
    }

    bool inFlight() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPUTATION;
    }

    const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
    const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }

    std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
        return mMemories[poolIndex]->getRunTimePoolInfo();
    }

   protected:
    // If a callback is provided, then this is asynchronous. If a callback is
    // not provided (i.e., is nullptr), then this is synchronous.
    //
    // If burst is provided, then the burst path will be used. If a burst is not
    // provided (i.e., is nullptr), then a synchronous execution will occur.
    //
    // Providing both synchronizationCallback and burstBuilder is an error.
    int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                BurstBuilder* burstBuilder = nullptr);

    virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;

    virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) = 0;

    // This method handles the common preparation and validation logic of compute and
    // computeFenced. It will be called at the start of every computation.
    int prepareForCompute(const char* name);

    const CompilationBuilder* mCompilation;

    // Update output dimensional information from OutputShape to ModelArgumentInfo.
    bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);

    bool updateMemories();

    const ModelBuilder* mModel;
    const ExecutionPlan* mPlan;

    // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning* captured
    // from CompilationBuilder when the ExecutionBuilder is constructed.
    bool mAllowCpuFallback;

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Do we ask the driver to measure timing?
    bool mMeasureTiming = false;

    // Timing reported from the driver. This field is only used if
    // mFencedExecutionCallback is nullptr.
    Timing mTimingWithoutFencedExecutionCallback = {};

    // Amount of time to complete or abort the execution.
    std::optional<uint64_t> mTimeoutDuration;

    // Amount of time to complete or abort a loop.
    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

    // The state of the execution.
    // Properties can only be set when the execution is in the state State::PREPARATION.
    // Timing and output shapes can only be queried when the execution is in the state
    // State::COMPLETED.
    enum class State { PREPARATION, COMPUTATION, COMPLETED };
    State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
    bool computationStarted() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState != State::PREPARATION;
    }
    bool completed() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPLETED;
    }
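
    // Illustrative lifecycle sketch (added for clarity): a single, non-reusable execution
    // typically moves through the states as follows. checkAndSetComputationState() performs
    // the PREPARATION -> COMPUTATION transition, and finishComputation() moves the execution
    // out of COMPUTATION when the computation finishes.
    //
    //     PREPARATION   set{Input,Output}[FromMemory], setMeasureTiming, setTimeoutDuration, ...
    //         |
    //     COMPUTATION   compute / computeFenced in flight (inFlight() == true)
    //         |
    //     COMPLETED     getDuration and getOutputOperand{Rank,Dimensions} may be queried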

    // Mutex to guard mState. Note that this is not strictly needed because we provide
    // no thread-safety guarantee to the ANeuralNetworksExecution object.
    mutable std::mutex mStateMutex;

    // Return false if the execution is in a bad state for starting computation.
    // Otherwise, return true and set the state to State::COMPUTATION.
    bool checkAndSetComputationState(const char* name);

    // With what error status has execution completed?
    enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
    Completion mCompletion = Completion::OTHER_ERROR;
    Completion completedWith() const {
        CHECK(completed());
        return mCompletion;
    }

    // The result code of request validation.
    // It is only evaluated once, the first time it is needed.
    std::optional<int> mValidationResultCode;
    int getValidationResultCode();

    // Does every tensor output operand of the model have a fully specified shape?
    // It is only evaluated once, the first time it is needed.
    std::optional<bool> mOutputsFullySpecified;
    bool areOutputsFullySpecified();

    // The callback used to query execution-related info in the case of fenced
    // execution; otherwise, nullptr. If the execution plan has multiple steps,
    // this is the callback associated with the last step. If the last step
    // doesn't support fenced execution (e.g., the driver is too old), or if the
    // launch of execution on the driver fails, then this callback will be
    // nullptr.
    ExecuteFencedInfoCallback mFencedExecutionCallback;

    // Whether set{Input,Output}[FromMemory] can accept padded length or not.
    bool mInputAndOutputPaddingEnabled = false;

    // enableInputAndOutputPadding may only be called before any call of
    // set{Input,Output}[FromMemory].
    bool mHasCalledSetInputOutput = false;

    // Can compute APIs be invoked multiple times on the execution object?
    bool mReusable = false;
};

// For an execution plan with a SIMPLE body, i.e., the whole model is executed on a single device.
class SimpleExecutionBuilder : public ExecutionBuilder {
   public:
    SimpleExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;

   private:
    std::shared_ptr<StepExecutor> mExecutor;
};

// For an execution plan with a COMPOUND body, i.e., a partitioned execution with multiple steps.
class CompoundExecutionBuilder : public ExecutionBuilder {
   public:
    CompoundExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;
};

// class StepExecutor is used to execute a single "step" in a
// potentially multiple-"step" execution process. The graph associated
// with that step is executed in its entirety on a single device (or
// on the CPU).
class StepExecutor {
   public:
    // executionBuilder
    //     Describes the full (possibly multiple-"step") execution.
    // model
    //     The model to be executed by the executor. Possibly a single
    //     "step" model of a multiple-"step" executionBuilder.
    // driver, preparedModel
    //     The device on which to execute the "step", and the prepared
    //     model to execute on that device. For a non-fallback StepExecutor,
    //     neither is nullptr; for a fallback StepExecutor, both are ignored in
    //     StepExecutor::computeOnCpuFallback and may be nullptr.
    // reusable
    //     If true, multiple StepExecutor::compute/computeFenced may be called on this
    //     object; otherwise, only one StepExecutor::compute/computeFenced may be called.
    //     reusable must be false if mDynamicTemporaries != nullptr.
    // step
    //     Contains the output index mapping from the excerpted "step" model to the
    //     main model if the execution has multiple "steps". Must be nullptr
    //     otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    // dynamicTemporaries
    //     If the execution has multiple "steps", describes the temporaries
    //     of source models that do not have fully specified types and are outputs
    //     of "step" models. Must be nullptr otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                 std::shared_ptr<Device> device,
                 std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                 const ExecutionStep* step = nullptr,
                 DynamicTemporaries* dynamicTemporaries = nullptr);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // in the case where we have a single-"step" execution (i.e., the executor
    // is executing the entire model from the ExecutionBuilder).
    void mapInputsAndOutputsTrivially();
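
    // Illustrative sketch (added for clarity): for a single-"step" execution, a StepExecutor
    // may be built over the whole main model with no ExecutionStep or DynamicTemporaries, and
    // the arguments mapped straight through. The variables below are placeholders.
    //
    //     StepExecutor executor(executionBuilder, mainModel, device, preparedModel,
    //                           /*reusable=*/true);
    //     executor.mapInputsAndOutputsTrivially();
    //     auto [result, outputShapes, timing] = executor.compute(deadline);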

    // Update output shapes with shapes returned from execution.
    struct UpdateOutputShapes {
        // These fields are meaningless unless updateOutputShapes() returns true
        bool updatedDynamicTemporary;  // did shape (dimensions, size) information change for at
                                       // least one dynamic temporary?
        bool mainOutputInsufficient;  // is at least one main model output written by this execution
                                      // marked !isSufficient?
        bool zeroSizedInput;  // is at least one output of this execution step a zero-sized tensor
                              // that needs to be read by some other step of the same execution?
    };
    bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                            std::vector<OutputShape>* to, UpdateOutputShapes* update);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // one at a time. Note that these are input/output indexes, not
    // operand indexes.
    //
    // For mapOutputToInput(), outputDimensions may be nullptr if the input
    // operand has fully specified dimensions.
    void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
    }
    void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
    }
    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
                          const Dimensions* outputDimensions) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
                         outputDimensions);
    }

    // dimensions must either have zero rank or must be
    // consistent with and at least as well specified as operand dimensions
    // (i.e., either rank must match, or operand rank must be zero; and for each
    // individual dimension, either dimension must match, or operand dimension
    // must be zero).
    int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
                           uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
                                          length, dimensions, &mInputs.at(inputIndex));
    }
    int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
                            uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
                                          length, dimensions, &mOutputs.at(outputIndex));
    }
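
    // Worked example of the consistency rule above (added for clarity; the shapes are
    // hypothetical). For an operand declared with dimensions {2, 0, 3}, where 0 means
    // "unspecified":
    //     {}           is accepted (zero rank leaves the operand dimensions unchanged)
    //     {2, 4, 3}    is accepted (it refines the unspecified middle dimension)
    //     {2, 4}       is rejected (rank mismatch)
    //     {5, 4, 3}    is rejected (the first dimension contradicts the operand's 2)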

    // Executes using the (driver, preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);

    // Re-compiles and executes using the CPU, regardless of the (driver,
    // preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();

    bool isCpu() const;

    // Perform fenced execution and return error_code, sync_fence_fd and a
    // callback.
    std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
            const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline);

    // Do the dynamic temporaries defined by this step have valid allocations?
    // (true if there are no dynamic temporaries defined by this step.)
    bool areDynamicTemporariesAllocated() const;

   private:
    // builderDimensions may be nullptr if executorInputOrOutput has fully
    // specified dimensions.
    void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                          ModelArgumentInfo* executorInputOrOutput,
                          const Dimensions* builderDimensions = nullptr);

    // dimensions must either have zero rank or
    // must be consistent with and at least as well specified as operand
    // dimensions (i.e., either rank must match, or operand rank must be zero;
    // and for each individual dimension, either dimension must match, or
    // operand dimension must be zero).
    int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
                                   uint32_t offset, uint32_t length, const Dimensions& dimensions,
                                   ModelArgumentInfo* inputOrOutputInfo);

    // describes the full (possibly multiple-"step") execution
    ExecutionBuilder* mExecutionBuilder;

    // describes the single execution step
    const ExecutionStep* mExecutionStep;

    // describes the dynamic temporaries
    DynamicTemporaries* mDynamicTemporaries;

    // model to be executed on the executor, in both original and
    // compiled forms; and device on which to execute it
    const ModelBuilder* mModel;
    std::shared_ptr<Device> mDevice;
    std::shared_ptr<RuntimePreparedModel> mPreparedModel;

    // The reusable execution to launch multiple computations.
    // It is only created once, the first time it is needed.
    std::shared_ptr<RuntimeExecution> mExecution;
    // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
    std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Whether compute/computeFenced may be invoked multiple times.
    bool mReusable = false;
};

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);

}  // namespace nn
}  // namespace android

#endif  // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H