/**
 * Copyright 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef COM_EXAMPLE_ANDROID_NN_BENCHMARK_RUN_TFLITE_H
#define COM_EXAMPLE_ANDROID_NN_BENCHMARK_RUN_TFLITE_H

#include "tensorflow/lite/delegates/gpu/delegate.h"
#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/model.h"

#include <unistd.h>

#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <vector>

struct InferenceOutput {
  uint8_t* ptr;
  size_t size;
};

// Inputs and expected outputs for inference
struct InferenceInOut {
  // Input can either be directly specified as a pointer or indirectly with
  // the createInput callback. This is needed for large datasets where
  // allocating memory for all inputs at once is not feasible.
  uint8_t* input;
  size_t input_size;

  std::vector<InferenceOutput> outputs;
  std::function<bool(uint8_t*, size_t)> createInput;
};
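
// Illustrative usage sketch (not part of the original header): an entry can
// carry a preloaded buffer, or defer input generation to createInput for
// datasets too large to hold in memory at once. The buffer names below are
// hypothetical.
//
//   InferenceInOut entry{};
//   entry.input = preloadedInput;        // uint8_t* owned by the caller
//   entry.input_size = preloadedSize;
//   entry.outputs = {{goldenOutput, goldenOutputSize}};
//   // Alternatively, let the benchmark fill the input buffer lazily:
//   entry.createInput = [](uint8_t* buffer, size_t size) {
//     std::fill(buffer, buffer + size, 0);  // placeholder input generator
//     return true;
//   };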

// Inputs and expected outputs for an inference sequence.
using InferenceInOutSequence = std::vector<InferenceInOut>;

// Result of a single inference
struct InferenceResult {
  float computeTimeSec;
  // MSE for each output
  std::vector<float> meanSquareErrors;
  // Max single error for each output
  std::vector<float> maxSingleErrors;
  // Outputs
  std::vector<std::vector<uint8_t>> inferenceOutputs;
  int inputOutputSequenceIndex;
  int inputOutputIndex;
};

struct CompilationBenchmarkResult {
  std::vector<float> compileWithoutCacheTimeSec;
  // The following optional fields have no value if compilation caching is not supported.
  std::optional<std::vector<float>> saveToCacheTimeSec;
  std::optional<std::vector<float>> prepareFromCacheTimeSec;
  // The total size of cache files. It is zero if compilation caching is not supported.
  int cacheSizeBytes = 0;
};

/** Discard inference output in inference results. */
const int FLAG_DISCARD_INFERENCE_OUTPUT = 1 << 0;
/** Do not expect golden output for inference inputs. */
const int FLAG_IGNORE_GOLDEN_OUTPUT = 1 << 1;
/** Collect only 1 benchmark result every INFERENCE_OUT_SAMPLE_RATE inferences. */
const int FLAG_SAMPLE_BENCHMARK_RESULTS = 1 << 2;

const int INFERENCE_OUT_SAMPLE_RATE = 10;
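
// Illustrative flag combination (a sketch, not from the original header): the
// flags are bit masks, so they are combined with bitwise OR. With sampling
// enabled, only 1 of every INFERENCE_OUT_SAMPLE_RATE (i.e. 10) inferences
// contributes a result entry.
//
//   int flags = FLAG_DISCARD_INFERENCE_OUTPUT | FLAG_SAMPLE_BENCHMARK_RESULTS;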

enum class CompilationBenchmarkType {
  // Benchmark without cache
  WITHOUT_CACHE,
  // Benchmark cache miss
  SAVE_TO_CACHE,
  // Benchmark cache hit
  PREPARE_FROM_CACHE,
};

/** TFLite backend. */
constexpr int TFLITE_CPU = 0;
constexpr int TFLITE_NNAPI = 1;
constexpr int TFLITE_GPU = 2;

class BenchmarkModel {
 public:
  ~BenchmarkModel();

  static BenchmarkModel* create(const char* modelfile, int tfliteBackend,
                                bool enable_intermediate_tensors_dump,
                                int* nnapiErrno, const char* nnapi_device_name,
                                bool mmapModel, const char* nnapi_cache_dir);

  bool resizeInputTensors(std::vector<int> shape);
  bool setInput(const uint8_t* dataPtr, size_t length);
  bool runInference();
  // Resets TFLite states (RNN/LSTM states etc.).
  bool resetStates();

  bool benchmark(const std::vector<InferenceInOutSequence>& inOutData,
                 int seqInferencesMaxCount, float timeout, int flags,
                 std::vector<InferenceResult>* result);

  bool benchmarkCompilation(int maxNumIterations, float warmupTimeout, float runTimeout,
                            CompilationBenchmarkResult* result);

  bool dumpAllLayers(const char* path,
                     const std::vector<InferenceInOutSequence>& inOutData);

 private:
  BenchmarkModel() = default;
  bool init(const char* modelfile, int tfliteBackend,
            bool enable_intermediate_tensors_dump,
            int* nnapiErrno, const char* nnapi_device_name,
            /* flag to choose between memory mapping the model and initializing
               the model from the program's memory */
            bool mmapModel,
            const char* nnapi_cache_dir);

  void getOutputError(const uint8_t* dataPtr, size_t length,
                      InferenceResult* result, int output_index);
  void saveInferenceOutput(InferenceResult* result, int output_index);

  bool runCompilation(const char* cacheDir);
  bool benchmarkSingleTypeOfCompilation(CompilationBenchmarkType type, int maxNumIterations,
                                        float timeout, std::vector<float>* results);
  bool benchmarkSingleTypeOfCompilationWithWarmup(CompilationBenchmarkType type,
                                                  int maxNumIterations, float warmupTimeout,
                                                  float runTimeout, std::vector<float>* results);
  bool getCompilationCacheSize(int* cacheSizeBytes);

  std::string mModelBuffer;
  std::unique_ptr<tflite::FlatBufferModel> mTfliteModel;
  std::unique_ptr<tflite::Interpreter> mTfliteInterpreter;
  std::unique_ptr<tflite::StatefulNnApiDelegate> mTfliteNnapiDelegate;
  // Store indices of output tensors, used to dump intermediate tensors
  std::vector<int> outputs;

  // Parameters for compilation
  std::string mModelFile;
  std::optional<std::string> mCacheDir;
  std::string mNnApiDeviceName;
#if defined(NN_BENCHMARK_ENABLE_GPU)
  TfLiteDelegate* mGpuDelegate;
#endif  // defined(NN_BENCHMARK_ENABLE_GPU)
  int mTfliteBackend;
};
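
// Illustrative call sequence (a sketch; the model path, flag choices, and the
// inOutData vector of InferenceInOutSequence are hypothetical, prepared by
// the caller):
//
//   int nnapiErrno = 0;
//   std::unique_ptr<BenchmarkModel> model(BenchmarkModel::create(
//       "/data/local/tmp/model.tflite", TFLITE_NNAPI,
//       /*enable_intermediate_tensors_dump=*/false, &nnapiErrno,
//       /*nnapi_device_name=*/nullptr, /*mmapModel=*/true,
//       /*nnapi_cache_dir=*/nullptr));
//   if (model != nullptr) {
//     std::vector<InferenceResult> results;
//     model->benchmark(inOutData, /*seqInferencesMaxCount=*/100,
//                      /*timeout=*/10.0f, FLAG_DISCARD_INFERENCE_OUTPUT,
//                      &results);
//   }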

#endif  // COM_EXAMPLE_ANDROID_NN_BENCHMARK_RUN_TFLITE_H