You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
319 lines
10 KiB
319 lines
10 KiB
/******************************************************************************
|
|
*
|
|
* Copyright (C) 2018 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at:
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
*****************************************************************************
|
|
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
*/
|
|
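/**
*******************************************************************************
* @file
*  ihevce_sad_compute_neon.c
*
* @brief
*  Contains definitions of functions that compute the sum of absolute
*  differences (SAD) between a source block and a prediction block using
*  ARM NEON intrinsics
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ihevce_4x4_sad_computer_neon()
*  - ihevce_8xn_sad_computer_neon()
*  - ihevce_16xn_sad_computer_neon()
*  - ihevce_32xn_sad_computer_neon()
*  - ihevce_64xn_sad_computer_neon()
*  - ihevce_4mx4n_sad_computer_neon()
*  - ihevce_8x8_sad_computer_neon()
*  - ihevce_nxn_sad_computer_neon()
*
* @remarks
*  None
*
********************************************************************************
*/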
|
|
/*****************************************************************************/
|
|
/* File Includes */
|
|
/*****************************************************************************/
|
|
/* System include files */
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include <arm_neon.h>
|
|
|
|
/* User include files */
|
|
#include "ihevc_typedefs.h"
|
|
#include "ihevc_macros.h"
|
|
#include "itt_video_api.h"
|
|
#include "ihevc_cmn_utils_neon.h"
|
|
#include "ihevce_ipe_instr_set_router.h"
|
|
|
|
/*****************************************************************************/
|
|
/* Function Definitions */
|
|
/*****************************************************************************/
|
|
/**
 * @brief Sum of absolute differences between a source and a prediction block,
 *        computed on one 16-byte vector per input.
 *
 * Both inputs are fetched through load_unaligned_u8q(ptr, stride), a helper
 * defined outside this file (presumably it gathers four 4-byte rows into a
 * single 16-byte vector - confirm in ihevc_cmn_utils_neon.h).
 *
 * @param pu1_src    pointer to the source samples
 * @param pu1_pred   pointer to the prediction samples
 * @param src_strd   stride handed to the loader for the source
 * @param pred_strd  stride handed to the loader for the prediction
 *
 * @return Sum of the 16 absolute byte differences. The maximum possible value
 *         is 16 * 255 = 4080, so it always fits the UWORD16 return type.
 */
UWORD16 ihevce_4x4_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
{
    const uint8x16_t src_pix = load_unaligned_u8q(pu1_src, src_strd);
    const uint8x16_t pred_pix = load_unaligned_u8q(pu1_pred, pred_strd);

    /* |src - pred| of bytes 0..7, widened to 16 bits per lane */
    const uint16x8_t diff_lo = vabdl_u8(vget_low_u8(src_pix), vget_low_u8(pred_pix));
    /* add |src - pred| of bytes 8..15 on top of the same eight lanes */
    const uint16x8_t diff_all =
        vabal_u8(diff_lo, vget_high_u8(src_pix), vget_high_u8(pred_pix));

    /* horizontal reduction: 8 x u16 -> 4 x u32 -> 2 x u64 */
    const uint32x4_t sum_u32 = vpaddlq_u16(diff_all);
    const uint64x2_t sum_u64 = vpaddlq_u32(sum_u32);

    /* view both 64-bit partial sums as 32-bit lanes, add them, keep lane 0 */
    const uint32x2_t part_lo = vreinterpret_u32_u64(vget_low_u64(sum_u64));
    const uint32x2_t part_hi = vreinterpret_u32_u64(vget_high_u64(sum_u64));

    return vget_lane_u32(vadd_u32(part_lo, part_hi), 0);
}
|
|
|
|
/**
 * @brief SAD of a block that is 8 samples wide and ht rows tall.
 *
 * Each row contributes one absolute difference to each of the eight 16-bit
 * accumulator lanes. With ht <= 8 a lane holds at most 8 * 255 = 2040 and the
 * final sum is at most 8 * 8 * 255 = 16320, so neither the accumulator nor the
 * UWORD16 return value can wrap.
 *
 * @param pu1_src    pointer to the first source row (8 bytes are read per row)
 * @param pu1_pred   pointer to the first prediction row (8 bytes per row)
 * @param src_strd   distance, in bytes, between consecutive source rows
 * @param pred_strd  distance, in bytes, between consecutive prediction rows
 * @param ht         number of rows to process; asserted to be at most 8
 *
 * @return Sum of absolute differences over the 8 x ht block.
 */
static UWORD16 ihevce_8xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc = vdupq_n_u16(0);
    uint32x4_t sum_u32;
    uint64x2_t sum_u64;
    WORD32 rows_left;

    assert(ht <= 8);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        /* acc += |src_row - pred_row|, widened to 16 bits per lane */
        acc = vabal_u8(acc, vld1_u8(pu1_src), vld1_u8(pu1_pred));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* horizontal reduction: 8 x u16 -> 4 x u32 -> 2 x u64 */
    sum_u32 = vpaddlq_u16(acc);
    sum_u64 = vpaddlq_u32(sum_u32);

    /* add the two 64-bit partial sums through their 32-bit view, keep lane 0 */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_u64)),
            vreinterpret_u32_u64(vget_high_u64(sum_u64))),
        0);
}
|
|
|
|
/**
 * @brief SAD of a block that is 16 samples wide and ht rows tall.
 *
 * The low and high 8 bytes of every row are accumulated into two separate
 * 16-bit accumulators. With ht <= 16 each lane holds at most
 * 16 * 255 = 4080, so the 16-bit lanes cannot wrap.
 *
 * @param pu1_src    pointer to the first source row (16 bytes are read per row)
 * @param pu1_pred   pointer to the first prediction row (16 bytes per row)
 * @param src_strd   distance, in bytes, between consecutive source rows
 * @param pred_strd  distance, in bytes, between consecutive prediction rows
 * @param ht         number of rows to process; asserted to be at most 16
 *
 * @return Sum of absolute differences over the 16 x ht block.
 */
static UWORD32 ihevce_16xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc_lo = vdupq_n_u16(0); /* bytes 0..7 of every row  */
    uint16x8_t acc_hi = vdupq_n_u16(0); /* bytes 8..15 of every row */
    uint32x4_t sum_u32;
    uint64x2_t sum_u64;
    WORD32 rows_left;

    assert(ht <= 16);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t src_row = vld1q_u8(pu1_src);
        const uint8x16_t pred_row = vld1q_u8(pu1_pred);

        acc_lo = vabal_u8(acc_lo, vget_low_u8(src_row), vget_low_u8(pred_row));
        acc_hi = vabal_u8(acc_hi, vget_high_u8(src_row), vget_high_u8(pred_row));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* pairwise-widen acc_lo to 4 x u32, then fold acc_hi into the same lanes */
    sum_u32 = vpadalq_u16(vpaddlq_u16(acc_lo), acc_hi);
    sum_u64 = vpaddlq_u32(sum_u32);

    /* add the two 64-bit partial sums through their 32-bit view, keep lane 0 */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_u64)),
            vreinterpret_u32_u64(vget_high_u64(sum_u64))),
        0);
}
|
|
|
|
/**
 * @brief SAD of a block that is 32 samples wide and ht rows tall.
 *
 * Every row is read as two 16-byte vectors. Bytes 0..15 feed one 16-bit
 * accumulator and bytes 16..31 feed the other, so each lane receives two
 * absolute differences per row. With ht <= 32 a lane holds at most
 * 2 * 32 * 255 = 16320 and cannot wrap.
 *
 * @param pu1_src    pointer to the first source row (32 bytes are read per row)
 * @param pu1_pred   pointer to the first prediction row (32 bytes per row)
 * @param src_strd   distance, in bytes, between consecutive source rows
 * @param pred_strd  distance, in bytes, between consecutive prediction rows
 * @param ht         number of rows to process; asserted to be at most 32
 *
 * @return Sum of absolute differences over the 32 x ht block.
 */
static UWORD32 ihevce_32xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc_left = vdupq_n_u16(0);  /* columns 0..15  */
    uint16x8_t acc_right = vdupq_n_u16(0); /* columns 16..31 */
    uint32x4_t sum_u32;
    uint64x2_t sum_u64;
    WORD32 rows_left;

    assert(ht <= 32);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t src_l = vld1q_u8(pu1_src);
        const uint8x16_t pred_l = vld1q_u8(pu1_pred);
        const uint8x16_t src_r = vld1q_u8(pu1_src + 16);
        const uint8x16_t pred_r = vld1q_u8(pu1_pred + 16);

        acc_left = vabal_u8(acc_left, vget_low_u8(src_l), vget_low_u8(pred_l));
        acc_left = vabal_u8(acc_left, vget_high_u8(src_l), vget_high_u8(pred_l));

        acc_right = vabal_u8(acc_right, vget_low_u8(src_r), vget_low_u8(pred_r));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_r), vget_high_u8(pred_r));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* pairwise-widen the left half to 4 x u32, then fold the right half in */
    sum_u32 = vpadalq_u16(vpaddlq_u16(acc_left), acc_right);
    sum_u64 = vpaddlq_u32(sum_u32);

    /* add the two 64-bit partial sums through their 32-bit view, keep lane 0 */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_u64)),
            vreinterpret_u32_u64(vget_high_u64(sum_u64))),
        0);
}
|
|
|
|
/**
 * @brief SAD of a block that is 64 samples wide and ht rows tall.
 *
 * Every row is read as four 16-byte vectors. Columns 0..31 feed one 16-bit
 * accumulator and columns 32..63 feed the other, so each lane receives four
 * absolute differences per row. With ht <= 64 a lane holds at most
 * 4 * 64 * 255 = 65280, which still fits in 16 bits; a larger ht could wrap
 * the accumulators.
 *
 * @param pu1_src    pointer to the first source row (64 bytes are read per row)
 * @param pu1_pred   pointer to the first prediction row (64 bytes per row)
 * @param src_strd   distance, in bytes, between consecutive source rows
 * @param pred_strd  distance, in bytes, between consecutive prediction rows
 * @param ht         number of rows to process; asserted to be at most 64
 *
 * @return Sum of absolute differences over the 64 x ht block.
 */
static UWORD32 ihevce_64xn_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd, WORD32 ht)
{
    uint16x8_t acc_left = vdupq_n_u16(0);  /* columns 0..31  */
    uint16x8_t acc_right = vdupq_n_u16(0); /* columns 32..63 */
    uint32x4_t sum_u32;
    uint64x2_t sum_u64;
    WORD32 rows_left;

    assert(ht <= 64);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t src_a = vld1q_u8(pu1_src);
        const uint8x16_t pred_a = vld1q_u8(pu1_pred);
        const uint8x16_t src_b = vld1q_u8(pu1_src + 16);
        const uint8x16_t pred_b = vld1q_u8(pu1_pred + 16);
        const uint8x16_t src_c = vld1q_u8(pu1_src + 32);
        const uint8x16_t pred_c = vld1q_u8(pu1_pred + 32);
        const uint8x16_t src_d = vld1q_u8(pu1_src + 48);
        const uint8x16_t pred_d = vld1q_u8(pu1_pred + 48);

        /* left half of the row: columns 0..31 */
        acc_left = vabal_u8(acc_left, vget_low_u8(src_a), vget_low_u8(pred_a));
        acc_left = vabal_u8(acc_left, vget_high_u8(src_a), vget_high_u8(pred_a));
        acc_left = vabal_u8(acc_left, vget_low_u8(src_b), vget_low_u8(pred_b));
        acc_left = vabal_u8(acc_left, vget_high_u8(src_b), vget_high_u8(pred_b));

        /* right half of the row: columns 32..63 */
        acc_right = vabal_u8(acc_right, vget_low_u8(src_c), vget_low_u8(pred_c));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_c), vget_high_u8(pred_c));
        acc_right = vabal_u8(acc_right, vget_low_u8(src_d), vget_low_u8(pred_d));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_d), vget_high_u8(pred_d));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* pairwise-widen the left half to 4 x u32, then fold the right half in */
    sum_u32 = vpadalq_u16(vpaddlq_u16(acc_left), acc_right);
    sum_u64 = vpaddlq_u32(sum_u32);

    /* add the two 64-bit partial sums through their 32-bit view, keep lane 0 */
    return vget_lane_u32(
        vadd_u32(
            vreinterpret_u32_u64(vget_low_u64(sum_u64)),
            vreinterpret_u32_u64(vget_high_u64(sum_u64))),
        0);
}
|
|
|
|
UWORD32 ihevce_4mx4n_sad_computer_neon(
|
|
UWORD8 *pu1_src,
|
|
UWORD8 *pu1_pred,
|
|
WORD32 src_strd,
|
|
WORD32 pred_strd,
|
|
WORD32 blk_wd,
|
|
WORD32 blk_ht)
|
|
{
|
|
WORD32 sad = 0;
|
|
WORD32 i, j;
|
|
|
|
assert(blk_wd % 4 == 0);
|
|
assert(blk_ht % 4 == 0);
|
|
|
|
if(((blk_wd & (blk_wd - 1)) == 0) && (blk_wd <= 64))
|
|
{
|
|
// blk_wd { 4, 8, 16, 32, 64 }
|
|
for(i = 0; i < blk_ht;)
|
|
{
|
|
WORD32 ht = MIN(blk_wd, blk_ht - i);
|
|
|
|
switch(blk_wd)
|
|
{
|
|
case 4:
|
|
sad += ihevce_4x4_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd);
|
|
break;
|
|
case 8:
|
|
sad += ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
|
|
break;
|
|
case 16:
|
|
sad += ihevce_16xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
|
|
break;
|
|
case 32:
|
|
sad += ihevce_32xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
|
|
break;
|
|
case 64:
|
|
sad += ihevce_64xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, ht);
|
|
break;
|
|
default:
|
|
// should not be here
|
|
return -1;
|
|
}
|
|
i += ht;
|
|
pu1_src += (ht * src_strd);
|
|
pu1_pred += (ht * pred_strd);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Generic Case
|
|
for(i = 0; i < blk_ht; i += 4)
|
|
{
|
|
for(j = 0; j < blk_wd;)
|
|
{
|
|
WORD32 wd = blk_wd - j;
|
|
|
|
if(wd >= 32)
|
|
{
|
|
sad += ihevce_32xn_sad_computer_neon(
|
|
pu1_src + j, pu1_pred + j, src_strd, pred_strd, 4);
|
|
j += 32;
|
|
}
|
|
else if(wd >= 16)
|
|
{
|
|
sad += ihevce_16xn_sad_computer_neon(
|
|
pu1_src + j, pu1_pred + j, src_strd, pred_strd, 4);
|
|
j += 16;
|
|
}
|
|
else if(wd >= 8)
|
|
{
|
|
sad += ihevce_8xn_sad_computer_neon(
|
|
pu1_src + j, pu1_pred + j, src_strd, pred_strd, 4);
|
|
j += 8;
|
|
}
|
|
else
|
|
{
|
|
sad += ihevce_4x4_sad_computer_neon(
|
|
pu1_src + j, pu1_pred + j, src_strd, pred_strd);
|
|
j += 4;
|
|
}
|
|
}
|
|
pu1_src += (4 * src_strd);
|
|
pu1_pred += (4 * pred_strd);
|
|
}
|
|
}
|
|
return sad;
|
|
}
|
|
|
|
/**
 * @brief SAD of an 8x8 block.
 *
 * Thin wrapper that runs the 8-wide kernel over exactly 8 rows.
 *
 * @param pu1_src    pointer to the top-left source sample
 * @param pu1_pred   pointer to the top-left prediction sample
 * @param src_strd   distance, in bytes, between consecutive source rows
 * @param pred_strd  distance, in bytes, between consecutive prediction rows
 *
 * @return Sum of absolute differences over the 8x8 block.
 */
UWORD16 ihevce_8x8_sad_computer_neon(
    UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
{
    const UWORD16 sad_8x8 =
        ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 8);

    return sad_8x8;
}
|
|
|
|
/**
 * @brief SAD of a square trans_size x trans_size block.
 *
 * Dispatches to the kernel whose width equals trans_size. Note the argument
 * order: each stride directly follows its pointer here, whereas the kernels
 * take both pointers first.
 *
 * @param pu1_src     pointer to the top-left source sample
 * @param src_strd    distance, in bytes, between consecutive source rows
 * @param pu1_pred    pointer to the top-left prediction sample
 * @param pred_strd   distance, in bytes, between consecutive prediction rows
 * @param trans_size  block edge length; 4, 8, 16, 32 and 64 are handled
 *
 * @return Sum of absolute differences, or -1 for any other trans_size.
 */
WORD32 ihevce_nxn_sad_computer_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, WORD32 trans_size)
{
    if(trans_size == 4)
    {
        return ihevce_4x4_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd);
    }
    if(trans_size == 8)
    {
        return ihevce_8xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 8);
    }
    if(trans_size == 16)
    {
        return ihevce_16xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 16);
    }
    if(trans_size == 32)
    {
        return ihevce_32xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 32);
    }
    if(trans_size == 64)
    {
        return ihevce_64xn_sad_computer_neon(pu1_src, pu1_pred, src_strd, pred_strd, 64);
    }

    // should not be here
    return -1;
}
|