/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H

#include <ControlFlow.h>
#include <CpuExecutor.h>
#include <android-base/thread_annotations.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "ExecutionCallback.h"
#include "Memory.h"
#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"

namespace android {
namespace nn {

class BurstBuilder;
class CompilationBuilder;
class Device;
class DynamicTemporaries;
class ExecutionPlan;
class ExecutionStep;
class ModelBuilder;
class RuntimeMemory;
class RuntimePreparedModel;
class RuntimeExecution;
class StepExecutor;

class ExecutionBuilder {
    friend class StepExecutor;

   public:
    explicit ExecutionBuilder(const CompilationBuilder* compilation);
    virtual ~ExecutionBuilder() = default;

    int setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer,
                 size_t length);
    int setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                           const RuntimeMemory* memory, size_t offset, size_t length);
    int setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer,
                  size_t length);
    int setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type,
                            const RuntimeMemory* memory, size_t offset, size_t length);
    int setMeasureTiming(bool measure);
    int getDuration(int32_t durationCode, uint64_t* duration) const;
    int setTimeoutDuration(uint64_t duration);
    std::optional<uint64_t> getTimeoutDuration() const;
    int setLoopTimeout(uint64_t duration);
    uint64_t getLoopTimeoutDuration() const { return mLoopTimeoutDuration; }
    int enableInputAndOutputPadding(bool enable);
    int setReusable(bool reusable);

    int computeFenced(const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
                      int* sync_fence);

    int computeAsynchronously(std::shared_ptr<ExecutionCallback>* synchronizationCallback) {
        CHECK(synchronizationCallback != nullptr);
        return compute(synchronizationCallback);
    }
    int computeSynchronously() { return compute(nullptr); }
    int burstCompute(BurstBuilder* burst) { return compute(nullptr, burst); }
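
    // Illustrative mapping from the public NNAPI entry points to the compute
    // methods above (a non-normative sketch; "compilation", "inBuf", and
    // "outBuf" are assumed to exist on the caller's side):
    //
    //     ANeuralNetworksExecution* execution = nullptr;
    //     ANeuralNetworksExecution_create(compilation, &execution);  // wraps an ExecutionBuilder
    //     ANeuralNetworksExecution_setInput(execution, 0, /*type=*/nullptr, inBuf, sizeof(inBuf));
    //     ANeuralNetworksExecution_setOutput(execution, 0, /*type=*/nullptr, outBuf,
    //                                        sizeof(outBuf));
    //     ANeuralNetworksExecution_compute(execution);  // -> computeSynchronously()
    //     ANeuralNetworksExecution_free(execution);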
    // Initialize output dimensional information from ModelArgumentInfo.
    std::vector<OutputShape> getInitialOutputShapes() const;

    int getOutputOperandDimensions(uint32_t index, uint32_t* dimensions);
    int getOutputOperandRank(uint32_t index, uint32_t* rank);

    // Handshake with lower-level execution support
    bool measureTiming() const { return mMeasureTiming; }
    void reportTimingWithoutFencedExecutionCallback(Timing timing) {
        mTimingWithoutFencedExecutionCallback = timing;
    }

    const CompilationBuilder* getCompilation() const { return mCompilation; }
    const ModelBuilder* getModel() const { return mModel; }
    const ModelBuilder* getSourceModel(uint32_t index) const;
    const Operand& getSourceOperand(const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
        return getSourceModel(sourceOperandIndex.first)->getOperand(sourceOperandIndex.second);
    }

    // This method will be called at the end of all computation paths to change the state
    // of the execution object and update output shapes / memories.
    int finishComputation(int result, const std::vector<OutputShape>& outputShapes);
    ErrorStatus finishComputation(ErrorStatus error, const std::vector<OutputShape>& outputShapes) {
        const int result = finishComputation(convertErrorStatusToResultCode(error), outputShapes);
        return convertResultCodeToErrorStatus(result);
    }

    const ExecuteFencedInfoCallback& getExecuteFencedInfoCallback() {
        return mFencedExecutionCallback;
    }

    bool inFlight() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPUTATION;
    }

    const ModelArgumentInfo& getInputInfo(uint32_t index) const { return mInputs[index]; }
    const ModelArgumentInfo& getOutputInfo(uint32_t index) const { return mOutputs[index]; }

    std::optional<RunTimePoolInfo> getRunTimePoolInfo(uint32_t poolIndex) const {
        return mMemories[poolIndex]->getRunTimePoolInfo();
    }

   protected:
    // If a callback is provided, then this is asynchronous. If a callback is
    // not provided (i.e., is nullptr), then this is synchronous.
    //
    // If burst is provided, then the burst path will be used. If a burst is not
    // provided (i.e., is nullptr), then a synchronous execution will occur.
    //
    // Providing both synchronizationCallback and burstBuilder is an error.
    int compute(std::shared_ptr<ExecutionCallback>* synchronizationCallback,
                BurstBuilder* burstBuilder = nullptr);

    virtual std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) = 0;

    virtual std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) = 0;

    // This method handles the common preparation and validation logic of compute
    // and computeFenced. It will be called at the start of every computation.
    int prepareForCompute(const char* name);
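
    // Sketch of the asynchronous path described above (illustrative only;
    // assumes the ExecutionCallback from ExecutionCallback.h exposes a
    // blocking wait()):
    //
    //     std::shared_ptr<ExecutionCallback> callback;
    //     if (execution->computeAsynchronously(&callback) == ANEURALNETWORKS_NO_ERROR) {
    //         callback->wait();  // blocks until the computation completes
    //     }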
    const CompilationBuilder* mCompilation;

    // Update output dimensional information from OutputShape to ModelArgumentInfo.
    bool updateOutputShapes(ErrorStatus status, const std::vector<OutputShape>& outputShapes);

    bool updateMemories();

    const ModelBuilder* mModel;
    const ExecutionPlan* mPlan;

    // Whether CPU fallback is allowed based on the value of DeviceManager::kPartitioning*
    // captured from CompilationBuilder when the ExecutionBuilder is constructed.
    bool mAllowCpuFallback;

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Do we ask the driver to measure timing?
    bool mMeasureTiming = false;

    // Timing reported from the driver. This field is only used if
    // mFencedExecutionCallback is nullptr.
    Timing mTimingWithoutFencedExecutionCallback = {};

    // Amount of time to complete or abort the execution.
    std::optional<uint64_t> mTimeoutDuration;

    // Amount of time to complete or abort a loop.
    uint64_t mLoopTimeoutDuration = operation_while::kTimeoutNsDefault;

    // The state of the execution.
    // Properties can only be set when the execution is in the state State::PREPARATION.
    // Timing and output shapes can only be queried when the execution is in the state
    // State::COMPLETED.
    enum class State { PREPARATION, COMPUTATION, COMPLETED };
    State mState GUARDED_BY(mStateMutex) = State::PREPARATION;
    bool computationStarted() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState != State::PREPARATION;
    }
    bool completed() const {
        std::lock_guard<std::mutex> lock(mStateMutex);
        return mState == State::COMPLETED;
    }

    // Mutex to guard mState. Note that this is not strictly needed because we provide
    // no thread-safety guarantee to the ANeuralNetworksExecution object.
    mutable std::mutex mStateMutex;

    // Return false if the execution is in a bad state for starting computation.
    // Otherwise, return true and set the state to State::COMPUTATION.
    bool checkAndSetComputationState(const char* name);

    // With what error status has execution completed?
    enum class Completion { NO_ERROR, OUTPUT_INSUFFICIENT_SIZE, OTHER_ERROR };
    Completion mCompletion = Completion::OTHER_ERROR;
    Completion completedWith() const {
        CHECK(completed());
        return mCompletion;
    }

    // The result code of request validation.
    // It is only evaluated once, the first time it is needed.
    std::optional<int> mValidationResultCode;
    int getValidationResultCode();

    // Does every tensor output operand of the model have a fully specified shape?
    // It is only evaluated once, the first time it is needed.
    std::optional<bool> mOutputsFullySpecified;
    bool areOutputsFullySpecified();

    // The callback used to query execution related info in the case of fenced
    // execution; otherwise, nullptr. If the execution plan has multiple steps,
    // this is the callback associated with the last step. If the last step
    // doesn't support fenced execution (e.g., the driver is too old), or if the
    // launch of execution on the driver fails, then this callback will be
    // nullptr.
    ExecuteFencedInfoCallback mFencedExecutionCallback;

    // Whether set{Input,Output}[FromMemory] can accept padded length or not.
    bool mInputAndOutputPaddingEnabled = false;

    // enableInputAndOutputPadding may only be called before any call of
    // set{Input,Output}[FromMemory].
    bool mHasCalledSetInputOutput = false;

    // Can compute APIs be invoked multiple times on the execution object?
    bool mReusable = false;
};
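
// State machine implied by the comments above (sketch; the COMPLETED ->
// COMPUTATION edge is an inference from setReusable(), which allows the
// compute APIs to be invoked again on the same object):
//
//     PREPARATION --checkAndSetComputationState()--> COMPUTATION
//     COMPUTATION --finishComputation()------------> COMPLETED
//     COMPLETED   --next compute (reusable only)---> COMPUTATION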
// For an execution plan with a SIMPLE body, i.e. the whole model will be executed on a
// single device.
class SimpleExecutionBuilder : public ExecutionBuilder {
   public:
    SimpleExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;

   private:
    std::shared_ptr<StepExecutor> mExecutor;
};

// For an execution plan with a COMPOUND body, i.e. a partitioned execution with
// multiple steps.
class CompoundExecutionBuilder : public ExecutionBuilder {
   public:
    CompoundExecutionBuilder(const CompilationBuilder* compilation);

    std::tuple<int, std::vector<OutputShape>, Timing> computeInternal(
            const OptionalTimePoint& deadline, BurstBuilder* burstBuilder) override;

    std::tuple<int, int, ExecuteFencedInfoCallback> computeFencedInternal(
            const std::vector<int>& waitFor, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline) override;
};

// class StepExecutor is used to execute a single "step" in a potentially
// multiple-"step" execution process. The graph associated with that step is
// executed in its entirety on a single device (or on the CPU).
class StepExecutor {
   public:
    // executionBuilder
    //     Describes the full (possibly multiple-"step") execution.
    // model
    //     The model to be executed by the executor. Possibly a single
    //     "step" model of a multiple-"step" executionBuilder.
    // driver, preparedModel
    //     The device on which to execute the "step", and the prepared
    //     model to execute on that device. For a non-fallback StepExecutor,
    //     neither is nullptr; for a fallback StepExecutor, both are ignored in
    //     StepExecutor::computeOnCpuFallback and may be nullptr.
    // reusable
    //     If true, StepExecutor::compute/computeFenced may be called multiple
    //     times on this object; otherwise, only once. reusable must be false if
    //     mDynamicTemporaries != nullptr.
    // step
    //     Contains the output index mapping from the excerpted "step" model to
    //     the main model if the execution has multiple "steps". Must be nullptr
    //     otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    // dynamicTemporaries
    //     If the execution has multiple "steps", describes the temporaries
    //     of source models that do not have fully specified types and are outputs
    //     of "step" models. Must be nullptr otherwise.
    //     (step == nullptr) == (dynamicTemporaries == nullptr)
    StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
                 std::shared_ptr<Device> device,
                 std::shared_ptr<RuntimePreparedModel> preparedModel, bool reusable,
                 const ExecutionStep* step = nullptr,
                 DynamicTemporaries* dynamicTemporaries = nullptr);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // in the case where we have a single-"step" execution (i.e., the executor
    // is executing the entire model from the ExecutionBuilder).
    void mapInputsAndOutputsTrivially();
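
    // How a single-"step" plan might drive this class (illustrative sketch;
    // "builder", "device", "preparedModel", and "deadline" are assumed to be
    // supplied by the caller):
    //
    //     StepExecutor executor(builder, builder->getModel(), device, preparedModel,
    //                           /*reusable=*/false);
    //     executor.mapInputsAndOutputsTrivially();
    //     auto [n, outputShapes, timing] = executor.compute(deadline);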
    // Update output shapes with shapes returned from execution.
    struct UpdateOutputShapes {
        // These fields are meaningless unless updateOutputShapes() returns true.
        bool updatedDynamicTemporary;  // did shape (dimensions, size) information change for
                                       // at least one dynamic temporary?
        bool mainOutputInsufficient;   // is at least one main model output written by this
                                       // execution marked !isSufficient?
        bool zeroSizedInput;           // is at least one output of this execution step a
                                       // zero-sized tensor that needs to be read by some other
                                       // step of the same execution?
    };

    bool updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
                            std::vector<OutputShape>* to, UpdateOutputShapes* update);

    // Map inputs and outputs from ExecutionBuilder to StepExecutor,
    // one at a time. Note that these are input/output indexes, not
    // operand indexes.
    //
    // For mapOutputToInput(), outputDimensions may be nullptr if the input
    // operand has fully specified dimensions.
    void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
    }
    void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
    }
    void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
                          const Dimensions* outputDimensions) {
        mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
                         outputDimensions);
    }

    // dimensions must either have zero rank or must be
    // consistent with and at least as well specified as operand dimensions
    // (i.e., either rank must match, or operand rank must be zero; and for each
    // individual dimension, either dimension must match, or operand dimension
    // must be zero).
    int setInputFromMemory(uint32_t inputIndex, const RuntimeMemory* memory, uint32_t offset,
                           uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
                                          length, dimensions, &mInputs.at(inputIndex));
    }
    int setOutputFromMemory(uint32_t outputIndex, const RuntimeMemory* memory, uint32_t offset,
                            uint32_t length, const Dimensions& dimensions = {}) {
        return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
                                          length, dimensions, &mOutputs.at(outputIndex));
    }

    // Executes using the (driver, preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> compute(
            const OptionalTimePoint& deadline, const SharedBurst& burstController = nullptr);

    // Re-compiles and executes using the CPU, regardless of the (driver,
    // preparedModel) specified at construction time.
    std::tuple<int, std::vector<OutputShape>, Timing> computeOnCpuFallback();

    bool isCpu() const;

    // Perform fenced execution and return error_code, sync_fence_fd, and a
    // callback.
    std::tuple<int, int, ExecuteFencedInfoCallback> computeFenced(
            const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
            const OptionalTimePoint& deadline);

    // Do the dynamic temporaries defined by this step have valid allocations?
    // (true if there are no dynamic temporaries defined by this step.)
    bool areDynamicTemporariesAllocated() const;

   private:
    // builderDimensions may be nullptr if executorInputOrOutput has fully
    // specified dimensions.
    void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
                          ModelArgumentInfo* executorInputOrOutput,
                          const Dimensions* builderDimensions = nullptr);
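
    // Worked example of the dimension-consistency rule documented on
    // setInputFromMemory/setOutputFromMemory above and on
    // setInputOrOutputFromMemory below, for an operand with dimensions
    // {4, 0, 7} (0 = unspecified):
    //
    //     {}           // OK: zero rank; the operand's own dimensions remain in force
    //     {4, 3, 7}    // OK: rank matches, and dimension 1 refines 0 -> 3
    //     {4, 3}       // error: rank mismatch against a nonzero operand rank
    //     {5, 3, 7}    // error: dimension 0 conflicts (operand has 4)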
    // dimensions must either have zero rank or
    // must be consistent with and at least as well specified as operand
    // dimensions (i.e., either rank must match, or operand rank must be zero;
    // and for each individual dimension, either dimension must match, or
    // operand dimension must be zero).
    int setInputOrOutputFromMemory(const Operand& inputOrOutputOperand, const RuntimeMemory* memory,
                                   uint32_t offset, uint32_t length, const Dimensions& dimensions,
                                   ModelArgumentInfo* inputOrOutputInfo);

    // describes the full (possibly multiple-"step") execution
    ExecutionBuilder* mExecutionBuilder;

    // describes the single execution step
    const ExecutionStep* mExecutionStep;

    // describes the dynamic temporaries
    DynamicTemporaries* mDynamicTemporaries;

    // model to be executed on the executor, in both original and
    // compiled forms; and device on which to execute it
    const ModelBuilder* mModel;
    std::shared_ptr<Device> mDevice;
    std::shared_ptr<RuntimePreparedModel> mPreparedModel;

    // The reusable execution to launch multiple computations.
    // It is only created once, the first time it is needed.
    std::shared_ptr<RuntimeExecution> mExecution;
    // Returns {NO_ERROR, execution} on success, or {result_code, nullptr} on failure.
    std::pair<int, std::shared_ptr<RuntimeExecution>> getReusableExecution();

    // The information we'll send to the driver about the inputs and outputs.
    // Note that we build this in two steps:
    // 1. As the arguments are specified, set the corresponding mInputs or mOutputs element.
    //    If set from a pointer, don't set the location in the Request::Argument but store it
    //    instead in mInputBuffers or mOutputBuffers.
    // 2. Once we have all the inputs and outputs, if needed, allocate shared memory for
    //    the m*Buffers entries. Copy the input values into the shared memory.
    // We do this to avoid creating a lot of shared memory objects if we have a lot of
    // parameters specified via pointers. We also avoid copying in the case where
    // some of the nodes will be interpreted on the CPU anyway.
    std::vector<ModelArgumentInfo> mInputs;
    std::vector<ModelArgumentInfo> mOutputs;
    MemoryTracker mMemories;

    // Whether compute/computeFenced may be invoked multiple times.
    bool mReusable = false;
};

std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);

}  // namespace nn
}  // namespace android

#endif  // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H