// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <tuple>
#include <type_traits>
#include <typeinfo>

#include "../fixedpoint/fixedpoint.h"
#include "../public/output_stages.h"
#include "simd_wrappers.h"

namespace gemmlowp {

// Primary template for the buffer-level evaluation of a single output stage.
// It is not meant to be used directly: instantiating it with any non-void
// InputBufferType fails the static_assert below, which signals that no
// specialization exists for that (stage, buffer type) combination.
template <typename OutputStage, typename InputBufferType>
struct OutputStageEvalBufferImpl {
  static_assert(std::is_same<InputBufferType, void>::value,
                "Unimplemented: missing implementation of this output "
                "pipeline stage for this data type. This would happen if "
                "some architecture-specific SIMD back-end (output_$arch.h) "
                "were incomplete.");
};

// Generic block-level evaluation of an output stage. The register block is
// handled as a flat buffer: the work is forwarded to the matching
// OutputStageEvalBufferImpl and the resulting buffer is wrapped in a register
// block with the same dimensions as the input. The row and column arguments
// of Eval are unused here.
template <typename OutputStage, typename InputType>
struct OutputStageEvalImpl {
  using InputBufferType = typename InputType::BufferType;
  using BufferEvalImplType =
      OutputStageEvalBufferImpl<OutputStage, InputBufferType>;
  using OutputBufferType = typename BufferEvalImplType::OutputType;
  using OutputScalarType = typename OutputBufferType::ScalarType;

  static constexpr int kRows = InputType::kRows;
  static constexpr int kCols = InputType::kCols;
  using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>;

  OutputStageEvalImpl(const OutputStage& stage) : buffer_eval_impl(stage) {}

  // Evaluates the stage on input.buf and returns the result as a block.
  OutputType Eval(InputType input, int, int) const {
    OutputType result;
    result.buf = buffer_eval_impl.Eval(input.buf);
    return result;
  }

  const BufferEvalImplType buffer_eval_impl;
};

// Buffer-level evaluation of OutputStageQuantizeDownInt32ToUint8Scale.
// Each register r becomes
//   RoundingDivideByPOT((r + result_offset) * result_mult_int, result_shift).
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                                 RegisterBuffer<std::int32_t, Size>> {
  using OutputStage = OutputStageQuantizeDownInt32ToUint8Scale;
  using InputType = RegisterBuffer<std::int32_t, Size>;
  using OutputType = RegisterBuffer<std::int32_t, Size>;

  OutputStageEvalBufferImpl(const OutputStage& stage) : output_stage(stage) {}

  OutputType Eval(InputType input) const {
    using RegisterType = typename InputType::RegisterType;
    // The offset is duplicated into a full register once, outside the loop.
    const RegisterType offset = Dup<RegisterType>(output_stage.result_offset);
    const std::int32_t multiplier = output_stage.result_mult_int;
    const int shift = output_stage.result_shift;

    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      const auto with_offset = Add(input.reg[i], offset);
      const auto scaled = Mul(with_offset, multiplier);
      output.reg[i] = RoundingDivideByPOT(scaled, shift);
    }
    return output;
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
};

// Block-level evaluation of OutputStageQuantizeDownInt32ToUint8ScalePC.
// Same arithmetic as the non-PC stage, except that result_offset and
// result_mult_int are loaded for broadcasting at a position that depends on
// where the block sits: the row when Shape is VectorShape::Col, the column
// otherwise.
template <int Rows, int Cols, VectorShape Shape>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>,
                           RegisterBlock<std::int32_t, Rows, Cols>> {
  using OutputStage = OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>;
  using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
  using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;

  OutputStageEvalImpl(const OutputStage& stage) : output_stage(stage) {}

  OutputType Eval(InputType input, int row, int col) const {
    const int pos = (Shape == VectorShape::Col) ? row : col;
    const auto offset =
        LoadForBroadcasting<InputType>(output_stage.result_offset, pos);
    const auto multiplier =
        LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos);

    // (input + offset) * multiplier, still unshifted.
    const auto with_offset = BroadcastAdd<InputType>(input, offset);
    const auto dividend = BroadcastMul<InputType>(with_offset, multiplier);

    const int shift = output_stage.result_shift;
    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      output.buf.reg[i] = RoundingDivideByPOT(dividend.buf.reg[i], shift);
    }
    return output;
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
};

// Buffer-level evaluation of OutputStageQuantizeDownInt32ByFixedPoint.
// Each register goes through SaturatingRoundingDoublingHighMul with
// result_fixedpoint_multiplier, then RoundingDivideByPOT by result_shift, and
// finally has result_offset_after_shift added.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ByFixedPoint,
                                 RegisterBuffer<std::int32_t, Size>> {
  using OutputStage = OutputStageQuantizeDownInt32ByFixedPoint;
  using InputType = RegisterBuffer<std::int32_t, Size>;
  using OutputType = RegisterBuffer<std::int32_t, Size>;

  OutputStageEvalBufferImpl(const OutputStage& stage) : output_stage(stage) {}

  OutputType Eval(InputType input) const {
    using RegisterType = typename InputType::RegisterType;
    // The offset is duplicated into a full register once, outside the loop.
    const RegisterType offset =
        Dup<RegisterType>(output_stage.result_offset_after_shift);

    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      const RegisterType scaled = SaturatingRoundingDoublingHighMul(
          input.reg[i], output_stage.result_fixedpoint_multiplier);
      const auto rounded =
          RoundingDivideByPOT(scaled, output_stage.result_shift);
      output.reg[i] = Add(rounded, offset);
    }
    return output;
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
};

// Buffer-level evaluation of OutputStageScaleInt32ByFixedPointAndExponent.
//
// The signed result_exponent of the stage is split once, at construction,
// into two non-negative shift amounts:
//   left_shift  = max(0,  result_exponent), applied before the multiplication
//   right_shift = max(0, -result_exponent), applied after it as a
//                 RoundingDivideByPOT
// so at most one of the two is non-zero. Each register r then becomes
//   RoundingDivideByPOT(
//       SaturatingRoundingDoublingHighMul(ShiftLeft(r, left_shift),
//                                         result_fixedpoint_multiplier),
//       right_shift) + result_offset_after_shift.
//
// std::max comes from <algorithm>, which this header includes explicitly.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent,
                                 RegisterBuffer<std::int32_t, Size>> {
  typedef RegisterBuffer<std::int32_t, Size> InputType;
  typedef RegisterBuffer<std::int32_t, Size> OutputType;

  typedef OutputStageScaleInt32ByFixedPointAndExponent OutputStage;

  // Binds to the stage and precomputes both shift amounts. The members are
  // initialized in declaration order, so output_stage is already bound when
  // the shifts are derived from it.
  OutputStageEvalBufferImpl(const OutputStage& s)
      : output_stage(s),
        left_shift(std::max(0, output_stage.result_exponent)),
        right_shift(std::max(0, -output_stage.result_exponent)) {}

  // Applies the stage to every register of the input buffer.
  OutputType Eval(InputType input) const {
    OutputType output;
    using RegisterType = typename InputType::RegisterType;
    // The offset is duplicated into a full register once, outside the loop.
    const RegisterType result_offset_after_shift =
        Dup<RegisterType>(output_stage.result_offset_after_shift);
    for (int i = 0; i < InputType::kRegisterCount; i++) {
      const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul(
          ShiftLeft(input.reg[i], left_shift),
          output_stage.result_fixedpoint_multiplier);
      output.reg[i] = Add(RoundingDivideByPOT(mulhigh_val, right_shift),
                          result_offset_after_shift);
    }
    return output;
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
  // Non-negative shift applied to the input before the multiplication.
  int left_shift;
  // Non-negative shift applied to the product as a rounding division.
  int right_shift;
};

// Block-level evaluation of OutputStageScaleInt32ByFixedPointAndExponentPC.
// The exponent and the fixed-point multiplier are loaded for broadcasting at
// a position that depends on where the block sits: the column when Shape is
// VectorShape::Row, the row otherwise. The loaded exponent is split into a
// non-negative left shift (applied before the multiplication) and a
// non-negative right shift (applied after it as a rounding division);
// result_offset_after_shift is added last.
template <int Rows, int Cols, VectorShape Shape>
struct OutputStageEvalImpl<
    OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>,
    RegisterBlock<std::int32_t, Rows, Cols>> {
  using OutputStage = OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>;
  using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
  using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;

  OutputStageEvalImpl(const OutputStage& stage) : output_stage(stage) {}

  OutputType Eval(InputType input, int row, int col) const {
    using RegisterType = typename InputType::RegisterType;
    const int pos = (Shape == VectorShape::Row) ? col : row;

    // Load the exponent twice, then turn one copy into max(e, 0) and the
    // other into max(-e, 0).
    auto shift_left_amount =
        LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
    auto shift_right_amount =
        LoadForBroadcasting<InputType>(output_stage.result_exponent, pos);
    const auto multiplier = LoadForBroadcasting<InputType>(
        output_stage.result_fixedpoint_multiplier, pos);
    using ExponentBlockType = decltype(shift_left_amount);
    for (int i = 0; i < ExponentBlockType::kRegisterCount; ++i) {
      shift_left_amount.buf.reg[i] = Max(shift_left_amount.buf.reg[i], 0);
      shift_right_amount.buf.reg[i] = Max(-shift_right_amount.buf.reg[i], 0);
    }

    const auto scaled = BroadcastSaturatingRoundingDoublingHighMul(
        BroadcastShiftLeft(input, shift_left_amount), multiplier);
    const auto rounded =
        BroadcastRoundingDivideByPOT(scaled, shift_right_amount);

    const RegisterType offset =
        Dup<RegisterType>(output_stage.result_offset_after_shift);
    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      output.buf.reg[i] = Add(rounded.buf.reg[i], offset);
    }
    return output;
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
};

// Scalar-only implementation of OutputStageSaturatingCastToUint8: every
// 32-bit value is clamped to [0, 255] before being written to the
// std::uint8_t output buffer.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8,
                                 RegisterBuffer<std::int32_t, Size>> {
  using OutputStage = OutputStageSaturatingCastToUint8;
  using InputType = RegisterBuffer<std::int32_t, Size>;
  using OutputType = RegisterBuffer<std::uint8_t, Size>;
  static_assert(InputType::kRegisterLanes == 1,
                "This path is only for scalar values");

  // The stage carries nothing this implementation needs.
  OutputStageEvalBufferImpl(const OutputStage&) {}

  OutputType Eval(InputType input) const {
    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      std::int32_t clamped = input.reg[i];
      if (clamped > 255) {
        clamped = 255;
      } else if (clamped < 0) {
        clamped = 0;
      }
      output.reg[i] = clamped;
    }
    return output;
  }
};

// Scalar-only implementation of OutputStageSaturatingCastToInt8: every
// 32-bit value is clamped to [-128, 127] before being written to the
// std::int8_t output buffer.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8,
                                 RegisterBuffer<std::int32_t, Size>> {
  using OutputStage = OutputStageSaturatingCastToInt8;
  using InputType = RegisterBuffer<std::int32_t, Size>;
  using OutputType = RegisterBuffer<std::int8_t, Size>;
  static_assert(InputType::kRegisterLanes == 1,
                "This path is only for scalar values");

  // The stage carries nothing this implementation needs.
  OutputStageEvalBufferImpl(const OutputStage&) {}

  OutputType Eval(InputType input) const {
    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      std::int32_t clamped = input.reg[i];
      if (clamped > 127) {
        clamped = 127;
      } else if (clamped < -128) {
        clamped = -128;
      }
      output.reg[i] = clamped;
    }
    return output;
  }
};

// Scalar-only implementation of OutputStageSaturatingCastToInt16: every
// 32-bit value is clamped to [-32768, 32767] before being written to the
// std::int16_t output buffer.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16,
                                 RegisterBuffer<std::int32_t, Size>> {
  using OutputStage = OutputStageSaturatingCastToInt16;
  using InputType = RegisterBuffer<std::int32_t, Size>;
  using OutputType = RegisterBuffer<std::int16_t, Size>;
  static_assert(InputType::kRegisterLanes == 1,
                "This path is only for scalar values");

  // The stage carries nothing this implementation needs.
  OutputStageEvalBufferImpl(const OutputStage&) {}

  OutputType Eval(InputType input) const {
    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      std::int32_t clamped = input.reg[i];
      if (clamped > 32767) {
        clamped = 32767;
      } else if (clamped < -32768) {
        clamped = -32768;
      }
      output.reg[i] = clamped;
    }
    return output;
  }
};

// Scalar-only implementation of OutputStageTruncatingCastToUint8: every
// 32-bit value is assigned as-is to the std::uint8_t output buffer, with no
// clamping beforehand.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8,
                                 RegisterBuffer<std::int32_t, Size>> {
  using OutputStage = OutputStageTruncatingCastToUint8;
  using InputType = RegisterBuffer<std::int32_t, Size>;
  using OutputType = RegisterBuffer<std::uint8_t, Size>;
  static_assert(InputType::kRegisterLanes == 1,
                "This path is only for scalar values");

  // The stage carries nothing this implementation needs.
  OutputStageEvalBufferImpl(const OutputStage&) {}

  OutputType Eval(InputType input) const {
    OutputType output;
    for (int idx = 0; idx < InputType::kRegisterCount; ++idx) {
      output.reg[idx] = input.reg[idx];
    }
    return output;
  }
};

// Block-level evaluation of OutputStageBiasAddition: adds the stage's
// bias_vector to the input block with broadcasting. The bias is loaded at the
// block's column when VectorType::kShape is VectorShape::Row, and at its row
// otherwise.
template <int Rows, int Cols, typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           RegisterBlock<std::int32_t, Rows, Cols>> {
  using OutputStage = OutputStageBiasAddition<VectorType>;
  using InputType = RegisterBlock<std::int32_t, Rows, Cols>;
  using OutputType = RegisterBlock<std::int32_t, Rows, Cols>;

  OutputStageEvalImpl(const OutputStage& stage) : output_stage(stage) {}

  OutputType Eval(InputType input, int row, int col) const {
    const bool bias_is_row_vector = VectorType::kShape == VectorShape::Row;
    const int pos = bias_is_row_vector ? col : row;
    const auto bias =
        LoadForBroadcasting<InputType>(output_stage.bias_vector, pos);
    return BroadcastAdd<InputType>(input, bias);
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
};

// Buffer-level evaluation of OutputStageClamp: each register is raised to at
// least the stage's min and then lowered to at most the stage's max.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageClamp,
                                 RegisterBuffer<std::int32_t, Size>> {
  using OutputStage = OutputStageClamp;
  using InputType = RegisterBuffer<std::int32_t, Size>;
  using OutputType = RegisterBuffer<std::int32_t, Size>;

  OutputStageEvalBufferImpl(const OutputStage& stage) : output_stage(stage) {}

  OutputType Eval(InputType input) const {
    using RegisterType = typename InputType::RegisterType;
    // Both bounds are duplicated into full registers once, outside the loop.
    const RegisterType lower = Dup<RegisterType>(output_stage.min);
    const RegisterType upper = Dup<RegisterType>(output_stage.max);

    OutputType output;
    for (int i = 0; i < InputType::kRegisterCount; ++i) {
      const auto raised = Max(input.reg[i], lower);
      output.reg[i] = Min(raised, upper);
    }
    return output;
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
};

// Buffer-level evaluation of OutputStageTanh on 32-bit registers, built on a
// fixed-point tanh.
//
// The stage supplies two int32 values, real_zero_as_int32 and
// real_amplitude_as_int32. Each input is centered on the zero, multiplied by
// the (normalized) inverse amplitude, passed through tanh, multiplied by the
// (normalized) amplitude and re-centered on the zero. Inputs at or beyond
// 8 amplitudes away from the zero skip that result and are mapped directly to
// zero - amplitude or zero + amplitude.
//
// NOTE(review): the constructor appears to assume
// real_amplitude_as_int32 > 0. With a negative amplitude the first
// normalization loop below can never reach 0.5 -- confirm that callers
// guarantee a positive amplitude.
template <int Size>
struct OutputStageEvalBufferImpl<OutputStageTanh,
                                 RegisterBuffer<std::int32_t, Size>> {
  typedef RegisterBuffer<std::int32_t, Size> InputType;
  typedef RegisterBuffer<std::int32_t, Size> OutputType;
  using RegisterType = typename InputType::RegisterType;
  typedef RegisterType DataType;
  typedef OutputStageTanh OutputStage;

  // Precomputes everything that depends only on the stage parameters: the
  // input cutoffs, the output bounds, and normalized fixed-point forms of the
  // amplitude and of its inverse together with their exponents.
  OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
    const std::int32_t real_amplitude_as_int32 =
        output_stage.real_amplitude_as_int32;

    // Inputs outside (cutoff_min, cutoff_max) are replaced by the matching
    // output bound in Eval.
    input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
    input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
    output_min = real_zero_as_int32 - real_amplitude_as_int32;
    output_max = real_zero_as_int32 + real_amplitude_as_int32;

    // Double 1/amplitude until it is at least 0.5, counting the doublings in
    // inverse_amplitude_neg_exponent.
    double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
    inverse_amplitude_neg_exponent = 0;
    while (inverse_amplitude_normalized_double < 0.5) {
      inverse_amplitude_normalized_double *= 2;
      inverse_amplitude_neg_exponent++;
    }
    inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble(
        inverse_amplitude_normalized_double);

    // Halve the amplitude until it is below 1.0, counting the halvings in
    // amplitude_exponent.
    double amplitude_normalized_double = real_amplitude_as_int32;
    amplitude_exponent = 0;
    while (amplitude_normalized_double >= 1.0) {
      amplitude_normalized_double *= 0.5;
      amplitude_exponent++;
    }
    amplitude_normalized =
        FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double);
  }

  // Applies the stage to every register of the input buffer.
  OutputType Eval(InputType input) const {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

    // Fixed-point types with 3 and 0 integer bits respectively.
    typedef FixedPoint<DataType, 3> F3;
    typedef FixedPoint<DataType, 0> F0;

    OutputType output;

    for (int i = 0; i < OutputType::kRegisterCount; i++) {
      // fixed-point affine transformation: center on the zero, reinterpret
      // the raw value as F3 and multiply by the normalized inverse amplitude.
      DataType input_centered =
          Sub(input.reg[i], Dup<DataType>(real_zero_as_int32));
      F3 fixedpoint_input =
          F3::FromRaw(input_centered) * inverse_amplitude_normalized;
      // left shift of the raw value by 28 minus the number of doublings that
      // were applied to the inverse amplitude in the constructor.
      fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(),
                                         28 - inverse_amplitude_neg_exponent);
      // fixed-point tanh and multiplication by the normalized amplitude
      F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
      // right shift of the raw value by 31 minus the number of halvings that
      // were applied to the amplitude, then re-center on the zero.
      DataType int32_output =
          Add(Dup<DataType>(real_zero_as_int32),
              ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

      // Masks selecting the inputs that lie at or beyond the cutoffs.
      DataType mask_if_below_cutoff_min =
          MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min));
      DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual(
          input.reg[i], Dup<DataType>(input_cutoff_max));

      // The lower cutoff is checked first, then the upper cutoff; everything
      // else keeps the computed value.
      output.reg[i] = SelectUsingMask(
          mask_if_below_cutoff_min, Dup<DataType>(output_min),
          SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
                          int32_output));
    }
    return output;
  }

  // Held by reference, not copied.
  const OutputStage& output_stage;
  // Input thresholds: zero -/+ 8 * amplitude.
  std::int32_t input_cutoff_min, input_cutoff_max;
  // Output bounds: zero -/+ amplitude.
  std::int32_t output_min, output_max;
  // 1/amplitude after the doublings applied in the constructor.
  FixedPoint<DataType, 0> inverse_amplitude_normalized;
  // Number of doublings applied to 1/amplitude.
  int inverse_amplitude_neg_exponent;
  // Amplitude after the halvings applied in the constructor.
  FixedPoint<DataType, 0> amplitude_normalized;
  // Number of halvings applied to the amplitude.
  int amplitude_exponent;
};

// OutputPipelineOutputType computes, at compile time, the data type that
// comes out of a pipeline when stages FirstStage and onward are applied to
// InputType. It is a recursive template: the output type of the first stage
// becomes the input type of the remaining stages. See the explanation on
// OutputPipelineEvalImpl below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType {
  using FirstStageType =
      typename std::tuple_element<FirstStage, OutputPipelineType>::type;
  using FirstStageOutputType =
      typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType;
  using Type =
      typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
                                        FirstStageOutputType>::Type;
};

// Terminating case (StopRecursion == true): no stage is left to apply, so the
// pipeline output type is the input type itself.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
                                true> {
  using Type = InputType;
};

// OutputPipelineEvalImpl is a helper to implement the evaluation of
// the whole pipeline. It is a recursive template to implement compile-time
// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
// is how we implement recursion: each specialization implements only
// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
// helper to implement the termination of the recursion as a partial
// specialization below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
                                            InputType>::Type OutputType;

  OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
      : head_impl(std::get<FirstStage>(output_pipeline)),
        tail_impl(output_pipeline) {}

  OutputType Eval(InputType input, int row, int col) const {
    // Evaluate the first stage.
    FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
    // Recurse into the remaining stages.
    return tail_impl.Eval(first_stage_output, row, col);
  }

  const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
  const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
                               FirstStageOutputType>
      tail_impl;
};

// End of the recursion (StopRecursion == true): no stage is left, so Eval
// returns its input unchanged and the pipeline argument is ignored.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
  OutputPipelineEvalImpl(const OutputPipelineType&) {}

  InputType Eval(InputType input, int, int) const { return input; }
};

// Primary template of the final-store helper. It only exists to be
// specialized: instantiating it with any non-void RegisterBlockType fails the
// static_assert below.
template <typename RegisterBlockType, typename DstType>
struct StoreFinalOutputImpl {
  static_assert(std::is_same<void, RegisterBlockType>::value,
                "This generic impl should never be hit");
};

// Element-by-element store of a Rows x Cols register block into dst, with the
// block's top-left corner placed at (row, col). Element (r, c) of the block
// is read from src.buf.reg[r + c * Rows].
template <typename ScalarType, int Rows, int Cols, typename DstType>
struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> {
  using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>;
  static void Run(const RegisterBlockType& src, DstType* dst, int row,
                  int col) {
    for (int r = 0; r < Rows; ++r) {
      const int dst_row = row + r;
      for (int c = 0; c < Cols; ++c) {
        const int src_index = r + c * Rows;
        *dst->data(dst_row, col + c) = src.buf.reg[src_index];
      }
    }
  }
};

// StoreFinalOutput takes the final value at the end of the output pipeline and
// stores it into the destination matrix. It can be specialized for different
// data types; the generic implementation here is typically used only for plain
// old scalar (not SIMD) types.
template <typename RegisterBlockType, typename DstType>
void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) {
  StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col);
}

// OutputPipelineExecutor binds an output pipeline to one type of input
// register block. It owns the recursive evaluator built from the pipeline and
// exposes Execute as the single entry point.
template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor {
  OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
      : output_pipeline_eval_impl_(output_pipeline) {}

  // Execute is the entry point into the output pipeline evaluation code. It
  // should be the only thing that unpack code calls. It takes the result of
  // the unpack stage, runs it through the pipeline using the source
  // coordinates, and stores the outcome into the destination matrix at
  // (dst_row, dst_col).
  template <typename DstType>
  void Execute(InputType input, DstType* dst, int src_global_row,
               int src_global_col, int dst_row, int dst_col) const {
    // Statically check that the scalar type produced by the pipeline is the
    // scalar type of the destination matrix.
    using ScalarOutputType = typename OutputPipelineOutputType<
        OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType;
    using ScalarDstType = typename DstType::Scalar;
    static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
                  "mismatched destination scalar type and output pipeline");

    // Run every stage, then write the result out.
    auto pipeline_result =
        output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col);
    StoreFinalOutput(pipeline_result, dst, dst_row, dst_col);
  }

  const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
      output_pipeline_eval_impl_;
};

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "output_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "output_sse.h"
#elif defined(GEMMLOWP_MSA)
#include "output_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_OUTPUT_H_