/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_hme_utils_neon.c
*
* @brief
*  Contains function definitions for HME utility functions in NEON intrinsics
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - hme_get_wt_inp_8x8_neon()
*  - hme_get_wt_inp_ctb_neon()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_debug.h"
#include "ihevc_deblk.h"
#include "ihevc_defs.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_resi_trans.h"
#include "ihevc_sao.h"
#include "ihevc_structs.h"
#include "ihevc_weighted_pred.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_had_satd.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_global_tables.h"

#include "hme_datatype.h"
#include "hme_interface.h"
#include "hme_common_defs.h"
#include "hme_defs.h"
#include "ihevce_me_instr_set_router.h"
#include "hme_globals.h"
#include "hme_utils.h"
#include "hme_coarse.h"
#include "hme_refine.h"

/*****************************************************************************/
/* Constant Macros */
/*****************************************************************************/
#define IHEVCE_WT_PRED_SHIFT 15
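
/*
 * Editorial note (not from the original source): every weighted-input path
 * below computes a destination sample as
 *
 *   dst = (((src - off) << wpred_log_wdc) * inv_wt + (1 << 14))
 *             >> IHEVCE_WT_PRED_SHIFT, saturated to [0, 255]
 *
 * i.e. the reference's offset is removed and its weight divided out of the
 * current input (a_inv_wpred_wt is assumed to hold the Q15 reciprocal of
 * a_wpred_wt), so that plain SAD/SATD against that reference approximates
 * the weighted-prediction cost. The saturation is done with vqmovun_s16 /
 * vqshrun_n_s32 in the NEON code below.
 */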

/*****************************************************************************/
/* Function Definitions */
/*****************************************************************************/
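
/*
 * ihevce_get_wt_inp_4x8_neon() - editorial summary, inferred from the code
 * below: copies a 4x8 block from pu1_src into pu1_dst unchanged, then, for
 * each of the u1_num_valid_refs reference indices listed in ai4_wt_refs,
 * writes an inverse-weighted copy of the same block into that reference's
 * buffer (ps_wt_inp_prms->apu1_wt_inp[ref]) using the formula noted above.
 */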
static INLINE void ihevce_get_wt_inp_4x8_neon(
    const UWORD8 *pu1_src,
    UWORD8 *pu1_dst,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    WORD32 u1_num_valid_refs,
    WORD32 *ai4_wt_refs,
    WORD32 src_stride,
    WORD32 dst_stride)
{
    S32 inv_wt;
    S16 off;
    uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
    int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b, off_8x16b;
    int32x4_t dst0_4x32b, dst1_4x32b, dst2_4x32b, dst3_4x32b;
    int32x4_t dst4_4x32b, dst5_4x32b, dst6_4x32b, dst7_4x32b;
    int32x4_t add_4x32b, inv_wt_4x32b;
    U08 ref;
    int32x4_t log_wdc = vdupq_n_s32(ps_wt_inp_prms->wpred_log_wdc);

    src0_8x8b = vld1_u8((pu1_src + 0 * src_stride));
    src1_8x8b = vld1_u8((pu1_src + 1 * src_stride));
    src2_8x8b = vld1_u8((pu1_src + 2 * src_stride));
    src3_8x8b = vld1_u8((pu1_src + 3 * src_stride));
    /* Store */
    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

    if(u1_num_valid_refs)
    {
        /* Widen the four source rows to 16 bit */
        src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(src0_8x8b));
        src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(src1_8x8b));
        src2_8x16b = vreinterpretq_s16_u16(vmovl_u8(src2_8x8b));
        src3_8x16b = vreinterpretq_s16_u16(vmovl_u8(src3_8x8b));

        /* add value */
        add_4x32b = vdupq_n_s32(0x4000);
    }

    /* Run through all ref ids, except ref==num_ref, which is already done */
    for(ref = 0; ref < u1_num_valid_refs; ref++)
    {
        S32 i4_ref_idx = ai4_wt_refs[ref];

        /* InvWt and off specific to this ref id */
        inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[i4_ref_idx];
        off = (S16)ps_wt_inp_prms->a_wpred_off[i4_ref_idx];

        /* set1 uses multiple instructions : Try to AVOID it */
        off_8x16b = vdupq_n_s16(off);
        inv_wt_4x32b = vdupq_n_s32(inv_wt);

        /* Each ref id may have a different wt/offset, */
        /* so we have a unique inp buf for each ref id */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[i4_ref_idx];

        /* inp - off */
        src4_8x16b = vsubq_s16(src0_8x16b, off_8x16b);
        src5_8x16b = vsubq_s16(src1_8x16b, off_8x16b);
        src6_8x16b = vsubq_s16(src2_8x16b, off_8x16b);
        src7_8x16b = vsubq_s16(src3_8x16b, off_8x16b);

        dst0_4x32b = vmovl_s16(vget_low_s16(src4_8x16b));
        dst1_4x32b = vmovl_s16(vget_low_s16(src5_8x16b));
        dst2_4x32b = vmovl_s16(vget_low_s16(src6_8x16b));
        dst3_4x32b = vmovl_s16(vget_low_s16(src7_8x16b));

        dst4_4x32b = vmovl_s16(vget_high_s16(src4_8x16b));
        dst5_4x32b = vmovl_s16(vget_high_s16(src5_8x16b));
        dst6_4x32b = vmovl_s16(vget_high_s16(src6_8x16b));
        dst7_4x32b = vmovl_s16(vget_high_s16(src7_8x16b));

        /* (inp-off) << shift */
        dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
        dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
        dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
        dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

        /* (inp-off) << shift */
        dst4_4x32b = vshlq_s32(dst4_4x32b, log_wdc);
        dst5_4x32b = vshlq_s32(dst5_4x32b, log_wdc);
        dst6_4x32b = vshlq_s32(dst6_4x32b, log_wdc);
        dst7_4x32b = vshlq_s32(dst7_4x32b, log_wdc);

        /* ((inp-off) << shift) * inv_wt + 1<<14 */
        dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
        dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
        dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
        dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

        /* ((inp-off) << shift) * inv_wt + 1<<14 */
        dst4_4x32b = vmlaq_s32(add_4x32b, dst4_4x32b, inv_wt_4x32b);
        dst5_4x32b = vmlaq_s32(add_4x32b, dst5_4x32b, inv_wt_4x32b);
        dst6_4x32b = vmlaq_s32(add_4x32b, dst6_4x32b, inv_wt_4x32b);
        dst7_4x32b = vmlaq_s32(add_4x32b, dst7_4x32b, inv_wt_4x32b);

        /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
        src4_8x16b = vcombine_s16(
            vshrn_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst4_4x32b, IHEVCE_WT_PRED_SHIFT));
        src5_8x16b = vcombine_s16(
            vshrn_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst5_4x32b, IHEVCE_WT_PRED_SHIFT));
        src6_8x16b = vcombine_s16(
            vshrn_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst6_4x32b, IHEVCE_WT_PRED_SHIFT));
        src7_8x16b = vcombine_s16(
            vshrn_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst7_4x32b, IHEVCE_WT_PRED_SHIFT));
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), vqmovun_s16(src4_8x16b));
        vst1_u8((pu1_dst + 1 * dst_stride), vqmovun_s16(src5_8x16b));
        vst1_u8((pu1_dst + 2 * dst_stride), vqmovun_s16(src6_8x16b));
        vst1_u8((pu1_dst + 3 * dst_stride), vqmovun_s16(src7_8x16b));
    }
}
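
/*
 * hme_get_wt_inp_8x8_neon() - editorial summary, inferred from the code:
 * prepares the weighted-input planes for an 8x8 block at (pos_x, pos_y) of
 * the current layer. With weighted prediction off, the source block is
 * copied once into the common (ref == num_ref) buffer and every reference
 * pointer aliases it; with it on, references whose weight/offset differ from
 * the default additionally get an inverse-weighted copy produced by
 * ihevce_get_wt_inp_4x8_neon(). Partial blocks at the frame edge are padded
 * to the right and bottom.
 */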
void hme_get_wt_inp_8x8_neon(
    layer_ctxt_t *ps_curr_layer,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S32 dst_stride,
    S32 pos_x,
    S32 pos_y,
    S32 size,
    S32 num_ref,
    U08 u1_is_wt_pred_on)
{
    WORD32 ref;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 x_count, y_count;
    WORD32 src_stride = ps_curr_layer->i4_inp_stride;

    /* Make sure the start positions of the block are inside frame limits */
    pos_x = MIN(pos_x, ps_curr_layer->i4_wd - 1);
    pos_y = MIN(pos_y, ps_curr_layer->i4_ht - 1);

    /* In case we handle incomplete CTBs, we copy only as much as required */
    /* from the input buffers to prevent out of bound accesses. In this */
    /* case, we do padding in the x or y or both directions */
    x_count = MIN(size, (ps_curr_layer->i4_wd - pos_x));
    y_count = MIN(size, (ps_curr_layer->i4_ht - pos_y));

    /* Fixed source */
    pu1_src = ps_curr_layer->pu1_inp;
    pu1_src += (pos_x + (pos_y * src_stride));

    if(!u1_is_wt_pred_on)
    {
        uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;

        /************* Top 4x8 Processing ****************/
        /* Load Source : Lower 64 bit */
        src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
        src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
        src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
        src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

        /* ref==num_ref */ /* last ref will be non weighted input */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
        vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
        vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
        vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

        /************* Bottom 4x8 Processing ****************/
        pu1_src += 4 * src_stride;
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref] + 4 * dst_stride;

        /* Load Source : Lower 64 bit */
        src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
        src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
        src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
        src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);
        /* ref==num_ref */ /* last ref will be non weighted input */
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
        vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
        vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
        vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }

        for(ref = 0; ref < num_ref + 1; ref++)
        {
            ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        }
    }
    else
    {
        S32 wt, off;
        S32 ai4_wt_refs[MAX_NUM_REF];
        U08 u1_num_valid_refs = 0;

        for(ref = 0; ref < num_ref; ref++)
        {
            wt = ps_wt_inp_prms->a_wpred_wt[ref];
            off = ps_wt_inp_prms->a_wpred_off[ref];

            if((WGHT_DEFAULT == wt) && (0 == off))
            {
                ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
            }
            else
            {
                ai4_wt_refs[u1_num_valid_refs++] = ref;
                ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[ref];
            }
        }

        ps_wt_inp_prms->apu1_wt_inp[num_ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        /************* Top 4x8 Processing ****************/
        /* ref==num_ref */ /* last ref will be non weighted input */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
        ihevce_get_wt_inp_4x8_neon(
            pu1_src,
            pu1_dst,
            ps_wt_inp_prms,
            u1_num_valid_refs,
            ai4_wt_refs,
            src_stride,
            dst_stride);
        /************* Bottom 4x8 Processing ****************/
        pu1_src += 4 * src_stride;
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref] + 4 * dst_stride;
        ihevce_get_wt_inp_4x8_neon(
            pu1_src,
            pu1_dst,
            ps_wt_inp_prms,
            u1_num_valid_refs,
            ai4_wt_refs,
            src_stride,
            dst_stride);

        for(ref = 0; ref < u1_num_valid_refs; ref++)
        {
            /* Check and do padding in right direction if need be */
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ai4_wt_refs[ref]];
            if(x_count != size)
            {
                hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
            }

            /* Check and do padding in bottom direction if need be */
            if(y_count != size)
            {
                hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
            }
        }

        /* Check and do padding in right direction if need be */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
}
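
/*
 * hme_get_wt_inp_ctb_neon() - editorial summary, inferred from the code:
 * CTB-sized variant of the function above. It walks the size x size block in
 * strips of four rows, picking a 16-, 8- or 4-column inner loop from the
 * alignment of x_count (the weighted path uses only the 8- and 4-column
 * variants), applies the same inverse weighting per non-default reference,
 * and pads partial CTBs at the frame edge to the right and bottom.
 */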
void hme_get_wt_inp_ctb_neon(
    layer_ctxt_t *ps_curr_layer,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S32 dst_stride,
    S32 pos_x,
    S32 pos_y,
    S32 size,
    S32 num_ref,
    U08 u1_is_wt_pred_on)
{
    WORD32 ref, i, j;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 x_count, y_count;
    WORD32 src_stride = ps_curr_layer->i4_inp_stride;

    /* In case we handle incomplete CTBs, we copy only as much as required */
    /* from the input buffers to prevent out of bound accesses. In this */
    /* case, we do padding in the x or y or both directions */
    x_count = MIN(size, (ps_curr_layer->i4_wd - pos_x));
    y_count = MIN(size, (ps_curr_layer->i4_ht - pos_y));

    /* Fixed source */
    pu1_src = ps_curr_layer->pu1_inp;
    pu1_src += (pos_x + (pos_y * src_stride));

    if(!u1_is_wt_pred_on)
    {
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(0 == (x_count & 15))
        {
            uint8x16_t src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 16) /* 16 cols at a time */
                {
                    /* Load 4x16 Source */
                    src0_16x8b = vld1q_u8(pu1_src + 0 * src_stride);
                    src1_16x8b = vld1q_u8(pu1_src + 1 * src_stride);
                    src2_16x8b = vld1q_u8(pu1_src + 2 * src_stride);
                    src3_16x8b = vld1q_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    /* Store */
                    vst1q_u8((pu1_dst + 0 * dst_stride), src0_16x8b);
                    vst1q_u8((pu1_dst + 1 * dst_stride), src1_16x8b);
                    vst1q_u8((pu1_dst + 2 * dst_stride), src2_16x8b);
                    vst1q_u8((pu1_dst + 3 * dst_stride), src3_16x8b);

                    pu1_src += 16;
                    pu1_dst += 16;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }
        else if(0 == (x_count & 7)) /* wd multiple of 8 case */
        {
            uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 8) /* 8 cols at a time */
                {
                    /* Load 4x8 Source */
                    src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
                    src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
                    src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
                    src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    /* Store */
                    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
                    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
                    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
                    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

                    pu1_src += 8;
                    pu1_dst += 8;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }
        else /* wd multiple of 4 case */
        {
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                {
                    /* ref==num_ref */ /* last ref will be non weighted input */
                    *(WORD32 *)(&pu1_dst[0 * dst_stride]) = *(WORD32 *)(&pu1_src[0 * src_stride]);
                    *(WORD32 *)(&pu1_dst[1 * dst_stride]) = *(WORD32 *)(&pu1_src[1 * src_stride]);
                    *(WORD32 *)(&pu1_dst[2 * dst_stride]) = *(WORD32 *)(&pu1_src[2 * src_stride]);
                    *(WORD32 *)(&pu1_dst[3 * dst_stride]) = *(WORD32 *)(&pu1_src[3 * src_stride]);

                    pu1_src += 4;
                    pu1_dst += 4;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }

        for(i = 0; i < num_ref + 1; i++)
        {
            ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        }

        /* Padding */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];

        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
    else
    {
        S32 ai4_wt_refs[MAX_NUM_REF];
        U08 u1_num_valid_refs = 0;
        int32x4_t dst0_4x32b, dst1_4x32b, dst2_4x32b, dst3_4x32b;
        int32x4_t inv_wt_4x32b, off_4x32b;
        int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;

        /* add value */
        int32x4_t add_4x32b = vdupq_n_s32(0x4000);
        int32x4_t log_wdc = vdupq_n_s32(ps_wt_inp_prms->wpred_log_wdc);

        for(i = 0; i < num_ref; i++)
        {
            if((WGHT_DEFAULT == (ps_wt_inp_prms->a_wpred_wt[i])) &&
               (0 == (ps_wt_inp_prms->a_wpred_off[i])))
            {
                ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
            }
            else
            {
                ai4_wt_refs[u1_num_valid_refs++] = i;
                ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[i];
            }
        }

        ps_wt_inp_prms->apu1_wt_inp[num_ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(0 == (x_count & 7)) /* wd multiple of 8 case */
        {
            uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
            int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b, off_8x16b;
            int32x4_t dst4_4x32b, dst5_4x32b, dst6_4x32b, dst7_4x32b;

            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 8) /* 8 cols at a time */
                {
                    /* Load 4x8 Source */
                    /* Load Source : Lower 64 bit */
                    src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
                    src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
                    src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
                    src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    pu1_dst = (ps_wt_inp_prms->apu1_wt_inp[num_ref]) + (i * dst_stride) + j;

                    /* Store */
                    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
                    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
                    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
                    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

                    if(u1_num_valid_refs)
                    {
                        /* Widen the four source rows to 16 bit */
                        src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(src0_8x8b));
                        src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(src1_8x8b));
                        src2_8x16b = vreinterpretq_s16_u16(vmovl_u8(src2_8x8b));
                        src3_8x16b = vreinterpretq_s16_u16(vmovl_u8(src3_8x8b));
                    }

                    /* Run through all ref ids, except ref==num_ref, which is already done */
                    for(ref = 0; ref < u1_num_valid_refs; ref++)
                    {
                        U08 u1_ref_idx = ai4_wt_refs[ref];

                        /* Each ref id may have a different wt/offset, */
                        /* so we have a unique inp buf for each ref id */
                        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[u1_ref_idx] + (i * dst_stride) + j;

                        /* InvWt and off specific to this ref id */
                        off_8x16b = vdupq_n_s16(ps_wt_inp_prms->a_wpred_off[u1_ref_idx]);
                        inv_wt_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_inv_wpred_wt[u1_ref_idx]);

                        /* inp - off */
                        src4_8x16b = vsubq_s16(src0_8x16b, off_8x16b);
                        src5_8x16b = vsubq_s16(src1_8x16b, off_8x16b);
                        src6_8x16b = vsubq_s16(src2_8x16b, off_8x16b);
                        src7_8x16b = vsubq_s16(src3_8x16b, off_8x16b);

                        dst0_4x32b = vmovl_s16(vget_low_s16(src4_8x16b));
                        dst1_4x32b = vmovl_s16(vget_low_s16(src5_8x16b));
                        dst2_4x32b = vmovl_s16(vget_low_s16(src6_8x16b));
                        dst3_4x32b = vmovl_s16(vget_low_s16(src7_8x16b));

                        dst4_4x32b = vmovl_s16(vget_high_s16(src4_8x16b));
                        dst5_4x32b = vmovl_s16(vget_high_s16(src5_8x16b));
                        dst6_4x32b = vmovl_s16(vget_high_s16(src6_8x16b));
                        dst7_4x32b = vmovl_s16(vget_high_s16(src7_8x16b));

                        /* (inp-off) << shift */
                        dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
                        dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
                        dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
                        dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

                        /* (inp-off) << shift */
                        dst4_4x32b = vshlq_s32(dst4_4x32b, log_wdc);
                        dst5_4x32b = vshlq_s32(dst5_4x32b, log_wdc);
                        dst6_4x32b = vshlq_s32(dst6_4x32b, log_wdc);
                        dst7_4x32b = vshlq_s32(dst7_4x32b, log_wdc);

                        /* ((inp-off) << shift) * inv_wt + 1<<14 */
                        dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
                        dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
                        dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
                        dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

                        /* ((inp-off) << shift) * inv_wt + 1<<14 */
                        dst4_4x32b = vmlaq_s32(add_4x32b, dst4_4x32b, inv_wt_4x32b);
                        dst5_4x32b = vmlaq_s32(add_4x32b, dst5_4x32b, inv_wt_4x32b);
                        dst6_4x32b = vmlaq_s32(add_4x32b, dst6_4x32b, inv_wt_4x32b);
                        dst7_4x32b = vmlaq_s32(add_4x32b, dst7_4x32b, inv_wt_4x32b);

                        /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
                        src4_8x16b = vcombine_s16(
                            vshrn_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst4_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src5_8x16b = vcombine_s16(
                            vshrn_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst5_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src6_8x16b = vcombine_s16(
                            vshrn_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst6_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src7_8x16b = vcombine_s16(
                            vshrn_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst7_4x32b, IHEVCE_WT_PRED_SHIFT));
                        /* Store */
                        vst1_u8((pu1_dst + 0 * dst_stride), vqmovun_s16(src4_8x16b));
                        vst1_u8((pu1_dst + 1 * dst_stride), vqmovun_s16(src5_8x16b));
                        vst1_u8((pu1_dst + 2 * dst_stride), vqmovun_s16(src6_8x16b));
                        vst1_u8((pu1_dst + 3 * dst_stride), vqmovun_s16(src7_8x16b));
                    }
                    /* Pointer update */
                    pu1_src += 8;
                }
                /* Pointer update */
                pu1_src = pu1_src - x_count + 4 * src_stride;
            }
        }
        else /* wd multiple of 4 case */
        {
            uint8x16_t src0_16x8b;
            int32x4_t src0_4x32b, src1_4x32b, src2_4x32b, src3_4x32b;
            WORD32 dst0, dst1, dst2, dst3;
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                {
                    /* ref==num_ref */ /* last ref will be non weighted input */

                    *(WORD32 *)(&pu1_dst[0 * dst_stride]) = *(WORD32 *)(&pu1_src[0 * src_stride]);
                    *(WORD32 *)(&pu1_dst[1 * dst_stride]) = *(WORD32 *)(&pu1_src[1 * src_stride]);
                    *(WORD32 *)(&pu1_dst[2 * dst_stride]) = *(WORD32 *)(&pu1_src[2 * src_stride]);
                    *(WORD32 *)(&pu1_dst[3 * dst_stride]) = *(WORD32 *)(&pu1_src[3 * src_stride]);

                    /* Pointer update */
                    pu1_src += 4;
                    pu1_dst += 4;
                }
                /* Pointer update */
                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }

            if(u1_num_valid_refs)
            {
                pu1_src = ps_curr_layer->pu1_inp;
                pu1_src += (pos_x + (pos_y * src_stride));

                /* Run through all ref ids, except ref==num_ref, which is already done */
                for(ref = 0; ref < u1_num_valid_refs; ref++)
                {
                    U08 u1_ref_idx = ai4_wt_refs[ref];

                    pu1_dst = ps_wt_inp_prms->apu1_wt_inp[u1_ref_idx];

                    /* InvWt and off specific to this ref id */
                    off_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_wpred_off[u1_ref_idx]);
                    inv_wt_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_inv_wpred_wt[u1_ref_idx]);

                    for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
                    {
                        for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                        {
                            src0_16x8b = load_unaligned_u8q(pu1_src, src_stride);

                            src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src0_16x8b)));
                            src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src0_16x8b)));

                            src0_4x32b = vmovl_s16(vget_low_s16(src0_8x16b));
                            src1_4x32b = vmovl_s16(vget_high_s16(src0_8x16b));
                            src2_4x32b = vmovl_s16(vget_low_s16(src1_8x16b));
                            src3_4x32b = vmovl_s16(vget_high_s16(src1_8x16b));

                            /* inp - off */
                            dst0_4x32b = vsubq_s32(src0_4x32b, off_4x32b);
                            dst1_4x32b = vsubq_s32(src1_4x32b, off_4x32b);
                            dst2_4x32b = vsubq_s32(src2_4x32b, off_4x32b);
                            dst3_4x32b = vsubq_s32(src3_4x32b, off_4x32b);

                            /* (inp-off) << shift */
                            dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
                            dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
                            dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
                            dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

                            /* ((inp-off) << shift) * inv_wt */
                            dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
                            dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
                            dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
                            dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

                            /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
                            dst0 = (WORD32)vget_lane_u64(
                                vreinterpret_u64_u16(
                                    vqshrun_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT)),
                                0);
                            dst1 = (WORD32)vget_lane_u64(
                                vreinterpret_u64_u16(
                                    vqshrun_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT)),
                                0);
                            dst2 = (WORD32)vget_lane_u64(
                                vreinterpret_u64_u16(
                                    vqshrun_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT)),
                                0);
                            dst3 = (WORD32)vget_lane_u64(
                                vreinterpret_u64_u16(
                                    vqshrun_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT)),
                                0);

                            *(WORD32 *)(&pu1_dst[0 * dst_stride]) = dst0;
                            *(WORD32 *)(&pu1_dst[1 * dst_stride]) = dst1;
                            *(WORD32 *)(&pu1_dst[2 * dst_stride]) = dst2;
                            *(WORD32 *)(&pu1_dst[3 * dst_stride]) = dst3;

                            /* Pointer update */
                            pu1_src += 4;
                            pu1_dst += 4;
                        }
                        /* Pointer update */
                        pu1_src = pu1_src - x_count + 4 * src_stride;
                        pu1_dst = pu1_dst - x_count + 4 * dst_stride;
                    }
                }
            }
        }

        /* Padding */
        for(ref = 0; ref < u1_num_valid_refs; ref++)
        {
            /* Check and do padding in right direction if need be */
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ai4_wt_refs[ref]];
            if(x_count != size)
            {
                hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
            }

            /* Check and do padding in bottom direction if need be */
            if(y_count != size)
            {
                hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
            }
        }

        /* Check and do padding in right direction if need be */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];

        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
}