You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1264 lines
58 KiB
1264 lines
58 KiB
/******************************************************************************
|
|
*
|
|
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at:
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
******************************************************************************/
|
|
/**
|
|
*******************************************************************************
|
|
* @file
|
|
* ihevc_deblck_atom_intr.c
|
|
*
|
|
* @brief
|
|
* Contains function definitions for deblocking filters
|
|
*
|
|
* @author
|
|
* Rishab
|
|
*
|
|
* @par List of Functions:
|
|
* - ihevc_deblk_luma_vert_ssse3()
|
|
* - ihevc_deblk_luma_horz_ssse3()
|
|
* - ihevc_deblk_chroma_vert_ssse3()
|
|
* - ihevc_deblk_chroma_horz_ssse3()
|
|
*
|
|
* @remarks
|
|
* None
|
|
*
|
|
*******************************************************************************
|
|
*/
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <assert.h>
|
|
#include "ihevc_typedefs.h"
|
|
#include "ihevc_platform_macros.h"
|
|
#include "ihevc_macros.h"
|
|
#include "ihevc_deblk.h"
|
|
#include "ihevc_deblk_tables.h"
|
|
#include "ihevc_debug.h"
|
|
|
|
#include "ihevc_tables_x86_intr.h"
|
|
|
|
#include <immintrin.h>
|
|
/**
|
|
*******************************************************************************
|
|
*
|
|
* @brief
|
|
* Decision process and filtering for the luma block vertical edge.
|
|
*
|
|
* @par Description:
|
|
* The decision process for the luma block vertical edge is carried out and
|
|
* an appropriate filter is applied. The boundary filter strength, bs should
|
|
* be greater than 0. The pcm flags and the transquant bypass flags should
|
|
* be taken care of by the calling function.
|
|
*
|
|
* @param[in] pu1_src
|
|
* Pointer to the src sample q(0,0)
|
|
*
|
|
* @param[in] src_strd
|
|
* Source stride
|
|
*
|
|
* @param[in] bs
|
|
* Boundary filter strength of q(0,0)
|
|
*
|
|
* @param[in] quant_param_p
|
|
* quantization parameter of p block
|
|
*
|
|
* @param[in] quant_param_q
|
|
* quantization parameter of p block
|
|
*
|
|
* @param[in] beta_offset_div2
|
|
*
|
|
*
|
|
* @param[in] tc_offset_div2
|
|
*
|
|
*
|
|
* @param[in] filter_flag_p
|
|
* flag whether to filter the p block
|
|
*
|
|
* @param[in] filter_flag_q
|
|
* flag whether to filter the q block
|
|
*
|
|
* @returns
|
|
*
|
|
* @remarks
|
|
* None
|
|
*
|
|
*******************************************************************************
|
|
*/
|
|
|
|
void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    /* Deblocking of one 4-row luma vertical edge (4 p-pixels and 4 q-pixels
     * per row around pu1_src). First the filter on/off and strong/normal
     * decision is made from rows 0 and 3, then either the strong filter
     * (de == 2) or the normal filter (de == 1) is applied with SSSE3
     * intrinsics. pu1_src points at q(0,0); the edge lies between
     * pu1_src[-1] (p0) and pu1_src[0] (q0). */
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;
    WORD32 d, dp, dq, d_sam0, d_sam3;

    WORD32 d3, d0, de_0, de_1, de_2, de_3;
    WORD32 de, dep, deq;
    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;

    {
        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;

        ASSERT((bs > 0) && (bs <= 3));
        ASSERT(filter_flag_p || filter_flag_q);

        /* average QP of the two blocks selects the beta/tc table entries */
        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is intra/inter edge */
        /* based on BS, tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */
        /* the above desired functionality is achieved by doing (2*(bs>>1)) */

        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        /* tc == 0 means no filtering at all for this edge */
        if(0 == tc)
        {
            return;
        }
        /* load rows 0 and 3 (p3..p0 q0..q3) — only these two rows feed the decision */
        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);

        /* coef_d computes the second-derivative terms p0-2*p1+p2 / q0-2*q1+q2 */
        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3 , dp0 dq0 values
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        /* de: 0 = no filter, 1 = normal filter, 2 = strong filter.
         * dep/deq: whether the normal filter also modifies p1/q1. */
        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            /* strong-filter decision on row 0 */
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            /* strong-filter decision on row 3 */
            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && de_1 < ((5 * tc + 1) >> 1))
            {
                d_sam3 = 1;
            }

            /* strong filter only if both rows agree */
            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {

        /* decision passed: load the remaining two rows and filter */
        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));

        if(de == 2)
        {
            /* ---- strong filter: modifies p2..p0 and q0..q2, clamped to +/-2*tc ---- */
            __m128i temp_pq_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_pq_str1_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;
            LWORD64 mask, tc2;
            /* strong-filter clip range is +/- 2*tc */
            tc = tc << 1;
            /* per-side enable bits placed in the sign bit of each 32-bit lane */
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);

            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
            //arranged x x x x x x x x q31 q30 q1 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);

            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);

            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);

            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            /* broadcast the low byte (2*tc) to all 16 lanes */
            const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            /* zero out 2*tc on the side whose filter flag is off, so that side clips to its original value */
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
            temp_str3_16x8b = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);

            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);

            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);

            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);

            //calculating Clipping MIN for all pixel values.
            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
            //to get q33 q23 q13 q03, p33 p23 p13 p03
            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);

            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq_str0_16x8b = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);

            //getting p0 p1 together and p2 p3 together
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
            //getting q1 q0 together and q3 q2 together
            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
            //getting p's of row0 row1 together and of row2 row3 together
            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
            temp_str2_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
            //getting q's of row0 row1 together and of row2 row3 together
            temp_str0_16x8b = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            //getting values for respective rows in 16 bit
            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            //packing values to 8 bit
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
            //Clipping MAX
            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
            //Clipping MIN
            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
            //separating row 2 and row 3
            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);

        }

        else
        {
            /* ---- normal filter: delta0 on p0/q0, optional delta1 on p1/q1 ---- */
            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 mask1;
            /* enable bits (sign of each 16-bit lane after expansion):
             * q1 needs filter_flag_q & deq, q0 needs filter_flag_q,
             * p0 needs filter_flag_p, p1 needs filter_flag_p & dep */
            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);

            consttc_8x16b = _mm_set1_epi32(tc);


            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);

            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);

            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q1 q10 p10 p11 q01 q00 p00 p01
            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //converting to 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //getting -tc store
            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
            //getting -tc store
            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //calc 10 *tc
            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
            //const 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
            //getting the mask values
            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
            //rearranging the mask values
            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
            //packing d3q d2q d1q d0q d3p d2p d1p d0p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
            //absolute delta
            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //mask for |delta| < 10*tc
            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);


            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);

            //getting -tc>>1 store 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);

            //getting all respective q's and p's together
            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
            tmp2_const_8x16b = _mm_setzero_si128();
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            // clipping delta1 to +tc/2
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            // clipping delta1 to -tc/2
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);

            //getting the mask ready
            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
            //packing dq1 dq0 dp0 dp1
            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);

            //masking of the delta values dep, deq , filter_p ,filter_q
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
            //converting 8bit to 16 bit
            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
            //shuffle values loaded
            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
            //arranging each row delta in different registers
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);

            //adding the respective delta
            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
            //saturating to 8 bit
            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
            //separating different rows
            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
        }

        /* write the four filtered rows back (8 bytes each: p3..p0 q0..q3) */
        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
    }
}
|
|
|
|
void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
|
|
WORD32 src_strd,
|
|
WORD32 bs,
|
|
WORD32 quant_param_p,
|
|
WORD32 quant_param_q,
|
|
WORD32 beta_offset_div2,
|
|
WORD32 tc_offset_div2,
|
|
WORD32 filter_flag_p,
|
|
WORD32 filter_flag_q)
|
|
{
|
|
WORD32 qp_luma, beta_indx, tc_indx;
|
|
WORD32 beta, tc;
|
|
|
|
WORD32 d0, d3, dp, dq, d;
|
|
WORD32 de_0, de_1, de_2, de_3;
|
|
WORD32 d_sam0, d_sam3;
|
|
WORD32 de, dep, deq;
|
|
|
|
__m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
|
|
__m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;
|
|
|
|
|
|
|
|
|
|
{
|
|
__m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
|
|
__m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
|
|
__m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
|
|
|
|
ASSERT((bs > 0));
|
|
ASSERT(filter_flag_p || filter_flag_q);
|
|
|
|
qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
|
|
beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
|
|
|
|
/* BS based on implementation can take value 3 if it is intra/inter egde */
|
|
/* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
|
|
/* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */
|
|
/* the above desired functionallity is achieved by doing (2*(bs>>1)) */
|
|
|
|
tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
|
|
|
|
beta = gai4_ihevc_beta_table[beta_indx];
|
|
tc = gai4_ihevc_tc_table[tc_indx];
|
|
if(0 == tc)
|
|
{
|
|
return;
|
|
}
|
|
src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
|
|
src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
|
|
src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
|
|
src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
|
|
src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
|
|
tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
|
|
src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
|
|
tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));
|
|
|
|
|
|
src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
|
|
src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);
|
|
|
|
src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
|
|
src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
|
|
|
|
src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
|
|
src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);
|
|
|
|
src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
|
|
src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);
|
|
|
|
coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
|
|
mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));
|
|
|
|
src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
|
|
//WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
|
|
mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);
|
|
|
|
mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);
|
|
|
|
|
|
//to get all 1's of 8 bit in (1)
|
|
temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
|
|
temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
|
|
//accumulating values foe dp3 dq3 , dp0 dq0 values
|
|
mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
|
|
|
|
temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
|
|
// to get all 1,-1 sets of 16 bits in (0)
|
|
temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
|
|
//q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
|
|
mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
|
|
//to get 16 bit 1's
|
|
temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
|
|
|
|
|
|
// dq3 dp3 dq0 dp0
|
|
mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
|
|
mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
|
|
mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
|
|
// dq dp d3 d0
|
|
mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
|
|
//|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
|
|
mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
|
|
//|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
|
|
mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
|
|
|
|
///store back in a single variable
|
|
temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
|
|
temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
|
|
mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
|
|
|
|
d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
|
|
d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
|
|
dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
|
|
dq = _mm_cvtsi128_si32(mask_16x8b);
|
|
//getting d
|
|
d = d0 + d3;
|
|
|
|
///store back in a single variable
|
|
temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
|
|
temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
|
|
mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
|
|
|
|
de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
|
|
de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
|
|
de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
|
|
de_3 = _mm_cvtsi128_si32(mask_16x8b);
|
|
|
|
de = 0;
|
|
dep = 0;
|
|
deq = 0;
|
|
if(d < beta)
|
|
{
|
|
d_sam0 = 0;
|
|
if((2 * d0 < (beta >> 2))
|
|
&& (de_2 < (beta >> 3))
|
|
&& (de_0 < ((5 * tc + 1) >> 1)))
|
|
{
|
|
d_sam0 = 1;
|
|
}
|
|
|
|
d_sam3 = 0;
|
|
if((2 * d3 < (beta >> 2))
|
|
&& (de_3 < (beta >> 3))
|
|
&& de_1 < ((5 * tc + 1) >> 1))
|
|
{
|
|
d_sam3 = 1;
|
|
}
|
|
|
|
de = (d_sam0 & d_sam3) + 1;
|
|
dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
|
|
deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
|
|
if(tc <= 1)
|
|
{
|
|
dep = 0;
|
|
deq = 0;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
if(de != 0)
|
|
{
|
|
|
|
if(2 == de)
|
|
{
|
|
|
|
__m128i temp_pq0_str0_16x8b;
|
|
__m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
|
|
__m128i temp_pq2_str0_16x8b;
|
|
__m128i temp_str0_16x8b, temp_str1_16x8b;
|
|
__m128i const2_8x16b, const2tc_8x16b;
|
|
|
|
LWORD64 mask, tc2;
|
|
tc = tc << 1;
|
|
mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
|
|
tc2 = ((LWORD64)tc);
|
|
|
|
const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
|
|
//q'0-q'1-2 ,p'0-p'1-2
|
|
temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
|
|
temp_str0_16x8b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
|
|
const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
|
|
//arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
|
|
temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);
|
|
|
|
const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
|
|
temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);
|
|
|
|
//q'1-2, p'1-2
|
|
temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
|
|
temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
|
|
temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
|
|
// q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
|
|
temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
|
|
// q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
|
|
temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);
|
|
|
|
temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
|
|
temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
|
|
|
|
//clipping mask design
|
|
temp_str1_16x8b = _mm_setzero_si128();
|
|
temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
|
|
const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
|
|
temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
|
|
const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
|
|
|
|
//clipping mask design
|
|
temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
|
|
const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
|
|
//calculating Clipping MAX for all pixel values.
|
|
src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
|
|
src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
|
|
//for clipping calc
|
|
src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
|
|
//saving the unmodified data of q1 p1 q0 p0
|
|
src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
|
|
//CLIpping MAX and MIN for q1 p1 q0 p0
|
|
src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
|
|
src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);
|
|
|
|
|
|
//q'2-q'0-2,p'2-p'0-2
|
|
tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
|
|
temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
|
|
const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
|
|
//arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
|
|
temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
|
|
src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
|
|
temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);
|
|
|
|
//calculating Clipping MAX and MIN for p2 and q2 .
|
|
tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
|
|
tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
|
|
//q'0-q'1-2 ,p'0-p'1-2
|
|
temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
|
|
temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
|
|
//q'1-2 p'1-2
|
|
temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
|
|
//to get 2 in 16 bit
|
|
const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
|
|
|
|
|
|
//q'1, p'1 (adding 2)
|
|
temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
|
|
//q'0-q'1,p'0-p'1
|
|
temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
|
|
//q'2-q'1,p'2-p'1
|
|
temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
|
|
//q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
|
|
temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
|
|
//q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
|
|
temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
|
|
|
|
//normalisation of all modified pixels
|
|
temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
|
|
temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
|
|
temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
|
|
//q'1 p'1 q'0 p'0
|
|
temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
|
|
temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
|
|
//pack with the unmodified data of q2 and p2
|
|
src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
|
|
//Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2 p'2
|
|
temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
|
|
src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
|
|
temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
|
|
src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
|
|
//Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
|
|
src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
|
|
src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
|
|
src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
|
|
src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
|
|
src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
|
|
src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);
|
|
|
|
_mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);
|
|
|
|
|
|
}
|
|
|
|
else
|
|
{
|
|
|
|
__m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
|
|
__m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
|
|
__m128i coefdelta_0_8x16b;
|
|
__m128i const2_8x16b, consttc_8x16b;
|
|
|
|
LWORD64 maskp0, maskp1, maskq0, maskq1;
|
|
maskp0 = (LWORD64)filter_flag_p;
|
|
maskq0 = (LWORD64)filter_flag_q;
|
|
maskp1 = (LWORD64)dep;
|
|
maskq1 = (LWORD64)deq;
|
|
consttc_8x16b = _mm_set1_epi32(tc);
|
|
|
|
tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
|
|
tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
|
|
//arranged q31 q30 p30 p31 q21 q20 p20 p21 q1 q10 p10 p11 q01 q00 p00 p01
|
|
tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
|
|
|
|
coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
|
|
// (-3q1+9q0),(-9p0+3p1)
|
|
tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
|
|
|
|
//getting -tc store
|
|
tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
|
|
|
|
//getting tc in 16 bit
|
|
consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
|
|
//calc 10 *tc = 2*tc +8*tc ; 2*tc
|
|
tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
|
|
//calc 10 *tc = 2*tc +8*tc ; 8*tc
|
|
tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
|
|
|
|
//const 1
|
|
const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
|
|
//calc 10 *tc
|
|
tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
|
|
//delta0 without normalisation and clipping
|
|
tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);
|
|
|
|
const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);
|
|
|
|
//loaded coef for delta1 calculation
|
|
coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
|
|
//(-2q1+q0),(p0-2p1)
|
|
tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
|
|
//const 8
|
|
const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
|
|
|
|
//normalisation of the filter
|
|
tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
|
|
tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
|
|
|
|
//getting deltaq0
|
|
tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
|
|
//getting -tc
|
|
tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
|
|
//packing d03q d02q d01q d0q d03p d02p d01p d00p
|
|
tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
|
|
//absolute delta
|
|
tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
|
|
|
|
//Clipping of delta0
|
|
tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
|
|
//tc>>1 16 bit
|
|
consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
|
|
//Clipping of delta0
|
|
tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);
|
|
|
|
//(-tc)>>1 16 bit
|
|
tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
|
|
//mask for |delta| < 10*tc
|
|
tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
|
|
//delta 1 calc starts
|
|
|
|
//getting q32 q22 q12 q02 p32 p12 p22 p02
|
|
tmp0_const_8x16b = _mm_setzero_si128();
|
|
src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
|
|
src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
|
|
src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
|
|
//constant 1
|
|
const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
|
|
//2*delta0
|
|
tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
|
|
//getting all respective q's and p's together
|
|
coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
|
|
tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
|
|
//final adds for deltap1 and deltaq1
|
|
tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
|
|
src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
|
|
tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
|
|
tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
|
|
|
|
//mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
|
|
tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
|
|
src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));
|
|
|
|
// src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
|
|
//mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
|
|
src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
|
|
coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));
|
|
|
|
src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
|
|
src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
|
|
//src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
|
|
src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);
|
|
|
|
//rearranging the mask values
|
|
src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
|
|
src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);
|
|
|
|
src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
|
|
src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
|
|
src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
|
|
src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);
|
|
|
|
//combining mask delta1
|
|
tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
|
|
// clipping delta1
|
|
tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
|
|
//combining mask delat0
|
|
tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
|
|
// clipping delta1
|
|
tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
|
|
|
|
|
|
//masking of the delta values |delta|<10*tc
|
|
tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
|
|
tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
|
|
//separating p and q delta 0 and addinq p0 and q0
|
|
tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
|
|
tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
|
|
src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
|
|
src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
|
|
src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
|
|
src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
|
|
//separating p and q delta 0 and addinq p0 and q0
|
|
tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
|
|
tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
|
|
src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
|
|
src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
|
|
src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
|
|
src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
|
|
//packing p1 q1 and p0 q0 to 8 bit
|
|
src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
|
|
src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);
|
|
|
|
src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
|
|
src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
|
|
|
|
_mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
|
|
_mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/**
 *******************************************************************************
 *
 * @brief
 *     Deblocks a vertical chroma edge (4 rows of interleaved UV pixels)
 *     using SSSE3 intrinsics. Filters at most one pixel on each side of
 *     the edge (p0 / q0) per the HEVC chroma deblocking filter.
 *
 * @param[in] pu1_src
 *  Pointer to the pixel on the q-side of the vertical edge (edge lies
 *  immediately to the left of pu1_src). Interleaved UV layout: each
 *  chroma pixel is a (U,V) byte pair.
 *
 * @param[in] src_strd
 *  Source stride in bytes.
 *
 * @param[in] quant_param_p
 *  QP of the p-side block.
 *
 * @param[in] quant_param_q
 *  QP of the q-side block.
 *
 * @param[in] qp_offset_u
 *  Chroma QP offset for the U plane.
 *
 * @param[in] qp_offset_v
 *  Chroma QP offset for the V plane.
 *
 * @param[in] tc_offset_div2
 *  tc offset (slice_tc_offset_div2), doubled before use.
 *
 * @param[in] filter_flag_p
 *  Non-zero if the p side is to be filtered.
 *
 * @param[in] filter_flag_q
 *  Non-zero if the q side is to be filtered.
 *
 * @returns None
 *
 *******************************************************************************
 */
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;

    /* At least one side must be flagged for filtering; caller guarantees this. */
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */

    /* Average the two block QPs, apply the per-plane chroma QP offset, then
     * map luma QP to chroma QP (table lookup for 0..57, linear -6 above). */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* Per-plane clipping thresholds tc; the "+2" corresponds to BS==2. */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* Both planes have zero clipping range: filtering is a no-op. */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }

    /* Load 8 bytes per row starting 4 bytes (2 UV pairs) left of the edge:
     * each row holds p1 p0 | q0 q1 as interleaved UV byte pairs. */
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask_flag, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;

        /* Sign bits of the two 32-bit halves carry the p/q filter flags;
         * expanded to full-lane masks below via arithmetic shift. */
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        /* Interleaved per-plane tc: tc_u in the low 16 bits, tc_v above it,
         * matching the UV lane order after the shuffles below. */
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        /* Byte mask that keeps the untouched outer pixels (p1/q1) and clears
         * the p0/q0 positions that will be rewritten. */
        mask = 0xffff00000000ffffLL;

        /* Pack rows 0+1 and rows 2+3 into one register each. */
        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        /* shuffle_uv rearranges the interleaved bytes so U and V samples of
         * p1,p0,q0,q1 line up for the madd-based delta computation. */
        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00

        /* delta0/delta1 hold the filter tap coefficients (combined they form
         * the (1 -4 4 -1) chroma filter — see comment below). */
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        /* Multiply-add byte pairs into 16-bit partial deltas for p and q sides. */
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        /* Broadcast interleaved (tc_u, tc_v) to all lanes. */
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        /* Arithmetic shift turns the flag sign bits into all-ones/all-zeros
         * 32-bit masks. */
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flag
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);

        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        /* Rounding offset before the >>3 normalisation. */
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        /* Duplicate the 64-bit keep-mask into both halves. */
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        /* Clip delta to [-tc, tc] per plane. */
        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);

        /* Zero the delta on any side whose filter flag is off. */
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);

        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);

        /* Saturating pack back to 8 bit: row 0+1 in one register, row 2+3 in
         * the other. */
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);

        /* Clear the old p0/q0 bytes, keep p1/q1 untouched. */
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);

        /* Position the filtered p0/q0 UV pairs at their byte offsets. */
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);

        /* Merge modified pixels into the unmodified surroundings. */
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);

        //geting values for row1 and row 3
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        /* Store the four 8-byte rows back. */
        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }
}
|
|
|
|
/**
 *******************************************************************************
 *
 * @brief
 *     Deblocks a horizontal chroma edge (4 interleaved UV pixel columns)
 *     using SSSE3 intrinsics. Filters at most one row on each side of the
 *     edge (p0 / q0) per the HEVC chroma deblocking filter.
 *
 * @param[in] pu1_src
 *  Pointer to the first pixel of the row on the q-side of the horizontal
 *  edge (edge lies immediately above pu1_src). Interleaved UV layout.
 *
 * @param[in] src_strd
 *  Source stride in bytes.
 *
 * @param[in] quant_param_p
 *  QP of the p-side block.
 *
 * @param[in] quant_param_q
 *  QP of the q-side block.
 *
 * @param[in] qp_offset_u
 *  Chroma QP offset for the U plane.
 *
 * @param[in] qp_offset_v
 *  Chroma QP offset for the V plane.
 *
 * @param[in] tc_offset_div2
 *  tc offset (slice_tc_offset_div2), doubled before use.
 *
 * @param[in] filter_flag_p
 *  Non-zero if the p side is to be filtered.
 *
 * @param[in] filter_flag_q
 *  Non-zero if the q side is to be filtered.
 *
 * @returns None
 *
 *******************************************************************************
 */
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    /* At least one side must be flagged for filtering; caller guarantees this. */
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */

    /* Average the two block QPs, apply the per-plane chroma QP offset, then
     * map luma QP to chroma QP (table lookup for 0..57, linear -6 above). */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* Per-plane clipping thresholds tc; the "+2" corresponds to BS==2. */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* Both planes have zero clipping range: filtering is a no-op. */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }

    /* Load the four rows around the edge: p1, p0 above, q0, q1 below.
     * Each 8-byte row holds 4 interleaved UV pixels. */
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc, mask_flag;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;

        /* Sign bits of the two 32-bit halves carry the p/q filter flags;
         * expanded to full-lane masks below via arithmetic shift. */
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        /* Interleaved per-plane tc: tc_u in the low 16 bits, tc_v above it,
         * matching the UV byte interleave of the source. */
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);

        /* Interleave p1 with p0 and q0 with q1 so each byte pair feeds one
         * maddubs multiply-accumulate lane. */
        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        /* delta0/delta1 hold the filter tap coefficients (combined they form
         * the (1 -4 4 -1) chroma filter — see comment below). */
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        /* Multiply-add byte pairs into 16-bit partial deltas for p and q sides. */
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);

        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        /* Broadcast interleaved (tc_u, tc_v) to all lanes. */
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        /* Arithmetic shift turns the flag sign bits into all-ones/all-zeros
         * 32-bit masks. */
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);

        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
        /* Rounding offset before the >>3 normalisation. */
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16bit
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        /* Clip delta to [-tc, tc] per plane. */
        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16bit
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        /* Zero the delta on any side whose filter flag is off. */
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        /* Saturating pack back to 8 bit; upper half is don't-care zeros. */
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);

        /* Store the two modified rows; p1/q1 rows are untouched. */
        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);
    }
}