/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_scan_coeffs_neon.c
*
* @brief
*  Contains definitions for scanning quantized tu
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - movemask_neon()            (file-local helper)
*  - ihevce_scan_coeffs_neon()
*
* @remarks
*  NOTE(review): this copy of the file shows signs of a lossy text extraction
*  (header names missing from the system includes, three comments truncated
*  in the middle of a shift expression). Those places are marked with
*  NOTE(review) below and must be restored from a pristine copy before the
*  file is built.
*
********************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
/* NOTE(review): the five directives below have lost their header names in   */
/* this copy. The code in this file calls abs() and NEON intrinsics, so the  */
/* list presumably contained at least the headers providing those - confirm. */
#include
#include
#include
#include
#include

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevc_deblk.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_sao.h"
#include "ihevc_resi_trans.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_global_tables.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_common_utils.h"

/*****************************************************************************/
/* Function Declarations                                                     */
/*****************************************************************************/
/* Declares the NEON scan routine with the project's function type so that   */
/* the definition below is checked against that prototype.                   */
FT_SCAN_COEFFS ihevce_scan_coeffs_neon;

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*  Packs the most significant bit of each of the 16 bytes of a vector into a
*  16-bit mask.
*
* @par Description:
*  Every byte is masked down to its top bit (0x80). The per-lane shift table
*  { -7 .. 0 } is applied with vshl_u8, where a negative count shifts right,
*  so lane k of each 8-byte half ends up holding either 0 or (1 << k). Three
*  rounds of pairwise addition then sum the eight lanes into lane 0; because
*  every lane owns a distinct bit, the sum equals the bitwise OR.
*
* @param[in] input
*  16 bytes; only bit 7 of each byte is examined.
*
* @returns
*  Value whose bit k (k = 0..15) is set when bit 7 of byte k of input is set.
*  Bits 0..7 come from the low half, bits 8..15 from the high half.
*
*******************************************************************************
*/
static WORD32 movemask_neon(uint8x16_t input)
{
    /* Per-lane shift counts: lane 0 moves bit 7 down to bit 0, lane 7 keeps it at bit 7 */
    const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
    uint8x8_t mask_and = vdup_n_u8(0x80);
    int8x8_t mask_shift = vld1_s8(xr);
    uint8x8_t lo = vget_low_u8(input);
    uint8x8_t hi = vget_high_u8(input);

    /* Isolate the top bit of every byte and move it to the lane's own bit position */
    lo = vand_u8(lo, mask_and);
    lo = vshl_u8(lo, mask_shift);
    hi = vand_u8(hi, mask_and);
    hi = vshl_u8(hi, mask_shift);

    /* Horizontal sum of the 8 lanes of each half; the total lands in lane 0 */
    lo = vpadd_u8(lo, lo);
    lo = vpadd_u8(lo, lo);
    lo = vpadd_u8(lo, lo);
    hi = vpadd_u8(hi, hi);
    hi = vpadd_u8(hi, hi);
    hi = vpadd_u8(hi, hi);

    /* NOTE(review): hi[0] / lo[0] subscript a NEON vector directly, which is a   */
    /* compiler vector extension; vget_lane_u8() is the intrinsic for lane reads. */
    return ((hi[0] << 8) | (lo[0] & 0xFF));
}

/**
*******************************************************************************
*
* @brief
*  Scans the quantized coefficients of one transform unit, 4x4 sub-block by
*  4x4 sub-block in reverse scan order, and serialises them into pu1_out_data.
*
* @par Description:
*  Pass 1 walks the sub-blocks from the last one in scan order until it finds
*  one whose coded flag is set in pu1_csbf_buf. For that sub-block it writes a
*  4-byte header (last_x, last_y, scan_idx, sub-block position in scan order)
*  followed by 16-bit words: the csbf word (0xBAD0 | flags), the gt0 flags,
*  the gt1 flags, the sign flags and then the abs_coeff_remaining entries.
*  Pass 2 walks the remaining sub-blocks; each one gets a csbf word (bit 0 =
*  own flag, bit 1 = right neighbour coded, bit 2 = bottom neighbour coded)
*  and, when bit 0 is set, its flag words and remaining-coefficient entries.
*
*  Within a sub-block the 16 coefficients are saturated to 8 bits, reordered
*  into scan order with a table lookup, and the three 16-bit flag masks are
*  produced with movemask_neon(); bit k of a mask refers to scan position k.
*
* @param[in] pi2_quant_coeffs
*  Quantized coefficients; successive rows are trans_size elements apart.
*
* @param[in] pi4_subBlock2csbfId_map
*  Maps a raster sub-block index to the index used to read pu1_csbf_buf.
*
* @param[in] scan_idx
*  Selects the row of the global scan tables; also stored in the header.
*
* @param[in] trans_size
*  Transform size. The switch below handles 32, 16, 8 and 4 only.
*
* @param[out] pu1_out_data
*  Destination buffer. NOTE(review): 16-bit stores are made at offset 4 from
*  this pointer, which presumably requires it to be suitably aligned - confirm.
*
* @param[in] pu1_csbf_buf
*  Coded sub-block flags; an existing comment assumes the values are 0 or 1.
*
* @param[in] i4_csbf_stride
*  Not used by this implementation.
*
* @returns
*  Number of bytes written to pu1_out_data.
*
* @remarks
*  NOTE(review): if pass 1 finds no coded sub-block, pu2_out_data_coeff is
*  never assigned in the visible code before it is used to compute the return
*  value; presumably callers only invoke this for a TU with at least one
*  coded sub-block - confirm.
*
*******************************************************************************
*/
WORD32 ihevce_scan_coeffs_neon(
    WORD16 *pi2_quant_coeffs,
    WORD32 *pi4_subBlock2csbfId_map,
    WORD32 scan_idx,
    WORD32 trans_size,
    UWORD8 *pu1_out_data,
    UWORD8 *pu1_csbf_buf,
    WORD32 i4_csbf_stride)
{
    WORD32 i, trans_unit_idx, num_gt1_flag, num_gt0_flag;
    UWORD16 u2_csbf0flags;
    WORD32 num_bytes = 0;
    UWORD8 *pu1_trans_table;
    UWORD8 *pu1_csb_table;
    WORD32 shift_value, mask_value;
    WORD32 blk_row, blk_col;
    WORD32 x_pos, y_pos;
    WORD32 quant_coeff;
    UWORD8 *pu1_out_data_header;
    UWORD16 *pu2_out_data_coeff;
    int8x16_t one, shuffle, zero;
    int16x8_t ones;
    int8x8x2_t quant;

    /* Parameter is part of the common prototype but not needed here */
    (void)i4_csbf_stride;
    pu1_out_data_header = pu1_out_data;

    u2_csbf0flags = 0xBAD0;

    /* Scan order of the 16 coefficients inside one 4x4 sub-block */
    pu1_csb_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);

    /* shift_value / mask_value split a raster sub-block index into its row  */
    /* (index >> shift_value) and column (index & mask_value)                */
    GETRANGE(shift_value, trans_size);
    shift_value = shift_value - 3;
    mask_value = (trans_size / 4) - 1;

    /* Scan order of the sub-blocks inside the transform unit.               */
    /* NOTE(review): there is no default case, so pu1_trans_table stays      */
    /* uninitialised for any trans_size other than 32, 16, 8 or 4.           */
    switch(trans_size)
    {
    case 32:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_8x8[scan_idx][0]);
        break;
    case 16:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
        break;
    case 8:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_2x2[scan_idx][0]);
        break;
    case 4:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_1x1[0]);
        break;
    }

    /* The 16 scan-table bytes double as lookup indices for vtbl2_s8 below */
    shuffle = vld1q_s8((WORD8 *)pu1_csb_table);
    zero = vdupq_n_s8(0);
    one = vdupq_n_s8(1);
    /* NOTE(review): 'ones' is set here but not read anywhere in the visible code */
    ones = vdupq_n_s16(1);

    /* Pass 1: search backwards in scan order for the last coded sub-block */
    for(trans_unit_idx = (trans_size * trans_size / 16) - 1; trans_unit_idx >= 0;
        trans_unit_idx--)
    {
        if(pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]])
        {
            WORD32 sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;
            WORD32 sign_flag, pos_last_coded;
            UWORD8 u1_last_x, u1_last_y;
            WORD16 *pi2_temp_quant_coeff = pi2_quant_coeffs;
            int16x4_t quant0, quant1, quant2, quant3;
            int16x8_t quant01, quant23;
            int8x8_t a, b, c, d, shuffle_0, shuffle_1;
            int8x16_t shuffle_out, shuffle_out_abs;
            uint8x16_t sign, eq0, eq1;

            /* Row / column of this sub-block, then the address of its top-left coefficient */
            blk_row = pu1_trans_table[trans_unit_idx] >> shift_value;
            blk_col = pu1_trans_table[trans_unit_idx] & mask_value;
            pi2_temp_quant_coeff += (blk_col * 4 + (blk_row * 4) * trans_size);

            /* Load the four rows of four 16-bit coefficients */
            quant0 = vld1_s16(pi2_temp_quant_coeff + 0 * trans_size);
            quant1 = vld1_s16(pi2_temp_quant_coeff + 1 * trans_size);
            quant2 = vld1_s16(pi2_temp_quant_coeff + 2 * trans_size);
            quant3 = vld1_s16(pi2_temp_quant_coeff + 3 * trans_size);

            quant01 = vcombine_s16(quant0, quant1);
            quant23 = vcombine_s16(quant2, quant3);

            /* Narrow with saturation to 8 bits: enough to decide sign, ==0 and |x|==1 */
            a = vqmovn_s16(quant01);
            b = vqmovn_s16(quant23);

            quant.val[0] = a;
            quant.val[1] = b;

            /* Reorder the 16 bytes from raster order into scan order */
            c = vget_low_s8(shuffle);
            d = vget_high_s8(shuffle);
            shuffle_0 = vtbl2_s8(quant, c);
            shuffle_1 = vtbl2_s8(quant, d);
            shuffle_out = vcombine_s8(shuffle_0, shuffle_1);

            shuffle_out_abs = vabsq_s8(shuffle_out);

            /* Per-byte 0xFF / 0x00 results: negative, equal to zero, magnitude equal to one */
            sign = vcgtq_s8(zero, shuffle_out);
            eq0 = vceqq_s8(shuffle_out, zero);
            eq1 = vceqq_s8(shuffle_out_abs, one);

            sign_flag = movemask_neon(sign);
            sig_coeff_abs_gt0_flags = movemask_neon(eq0);
            sig_coeff_abs_gt1_flags = movemask_neon(eq1);

            /* Invert "== 0" / "== 1" into "!= 0" / "!= 1", keep 16 bits, and */
            /* restrict gt1 to non-zero coefficients                          */
            sig_coeff_abs_gt0_flags = ~sig_coeff_abs_gt0_flags;
            sig_coeff_abs_gt1_flags = ~sig_coeff_abs_gt1_flags;
            sig_coeff_abs_gt0_flags = sig_coeff_abs_gt0_flags & 0x0000FFFF;
            sig_coeff_abs_gt1_flags = sig_coeff_abs_gt1_flags & sig_coeff_abs_gt0_flags;

            ASSERT(sig_coeff_abs_gt0_flags != 0);

            /* Presumably the index of the highest set bit, i.e. the last */
            /* non-zero coefficient in scan order - confirm macro         */
            GET_POS_MSB_32(pos_last_coded, sig_coeff_abs_gt0_flags);

            /* Update gt1 flag based on num_gt0_flag */
            num_gt0_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags);

            /* Find the position of 9th(MAX_GT_ONE+1) 1 in sig_coeff_abs_gt0_flags from MSB and update gt1 flag */
            if(num_gt0_flag > MAX_GT_ONE)
            {
                WORD32 gt0_first_byte = sig_coeff_abs_gt0_flags & 0xFF;
                WORD32 num_gt0_second_byte =
                    ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags & 0xFF00);
                WORD32 pos_nineth_one; /* pos. of 9th one from MSB of sig_coeff_abs_gt0_flags */
                WORD32 gt0_after_nineth_one, num_gt0_first_byte_to_nine;

                /* Number of ones still to be consumed from the low byte to reach the 9th one */
                num_gt0_first_byte_to_nine = (MAX_GT_ONE + 1) - num_gt0_second_byte;

                while(num_gt0_first_byte_to_nine)
                {
                    GET_POS_MSB_32(pos_nineth_one, gt0_first_byte);
                    /* NOTE(review): the comment that trails the next statement is   */
                    /* truncated in this copy (the text between a '<' and a later    */
                    /* '>' is missing). As it stands the comment runs on to the end  */
                    /* of "storing last_x and last_y" and swallows the num_gt1_flag  */
                    /* assignment; the while / if blocks opened above are never      */
                    /* closed and u1_last_x / u1_last_y are never assigned in the    */
                    /* visible text. Restore from a pristine copy. The indentation   */
                    /* that follows reflects the apparent nesting, not brace count.  */
                    gt0_first_byte = CLEAR_BIT(
                        gt0_first_byte,
                        pos_nineth_one); /*gt0_second_byte &= (~(0x1<> 2) + blk_row * 4; num_gt1_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt1_flags); /* storing last_x and last_y */
            *pu1_out_data_header = u1_last_x;
            pu1_out_data_header++;
            *pu1_out_data_header = u1_last_y;
            pu1_out_data_header++;

            /* storing the scan order */
            *pu1_out_data_header = (UWORD8)scan_idx;
            pu1_out_data_header++;

            /* storing last_sub_block pos. in scan order count */
            *pu1_out_data_header = (UWORD8)trans_unit_idx;
            pu1_out_data_header++;

            /* The first 4 bytes are stored; everything from here on is 16-bit, */
            /* so continue through a 16-bit pointer                              */
            pu2_out_data_coeff = (UWORD16 *)pu1_out_data_header;

            /* u2_csbf0flags word */
            u2_csbf0flags = 0xBAD0 | 1; /*since right&bottom csbf is 0*/

            /* storing u2_csbf0flags word */
            *pu2_out_data_coeff = u2_csbf0flags;
            pu2_out_data_coeff++;

            /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt0_flags;
            pu2_out_data_coeff++;

            /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt1_flags;
            pu2_out_data_coeff++;

            /* storing u2_sign_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sign_flag;
            pu2_out_data_coeff++;

            /* Store the u2_abs_coeff_remaining[] */
            for(i = 0; i < num_gt1_flag; i++)
            {
                volatile WORD32 bit_pos;

                ASSERT(sig_coeff_abs_gt1_flags != 0);

                /* Take the highest remaining gt1 bit and clear it */
                GET_POS_MSB_32(bit_pos, sig_coeff_abs_gt1_flags);
                /* NOTE(review): the trailing comment on the next statement is     */
                /* truncated in this copy as well; it swallows the text up to the  */
                /* end of "storing u2_abs_coeff_remaining[i] 2 bytes", including   */
                /* the quant_coeff load, and no x_pos / y_pos computation is       */
                /* visible. Restore from a pristine copy.                          */
                sig_coeff_abs_gt1_flags = CLEAR_BIT(
                    sig_coeff_abs_gt1_flags,
                    bit_pos); /*sig_coeff_abs_gt1_flags &= (~(0x1<> 2); quant_coeff = pi2_temp_quant_coeff[x_pos + (y_pos * trans_size)]; /* storing u2_abs_coeff_remaining[i] 2 bytes */
                /* The cast applies to abs() only; the subtraction is done afterwards */
                *pu2_out_data_coeff = (UWORD16)abs(quant_coeff) - 1;
                pu2_out_data_coeff++;
            }

            break; /*We just need this loop for finding 1st non-zero csb only*/
        }
    }

    /* go through remaining csb in the scan order */
    for(trans_unit_idx = trans_unit_idx - 1; trans_unit_idx >= 0; trans_unit_idx--)
    {
        blk_row = pu1_trans_table[trans_unit_idx] >> shift_value; /*row of csb*/
        blk_col = pu1_trans_table[trans_unit_idx] & mask_value; /*col of csb*/

        /* u2_csbf0flags word */
        u2_csbf0flags = 0xBAD0 | /* assuming csbf_buf has only 0 or 1 values */
                        (pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]]);

        /********************************************************************/
        /* Minor hack: As per HEVC spec csbf is not signalled in stream for */
        /* block0, instead sig coeff map is directly signalled. This is     */
        /* taken care by forcing csbf for block0 to be 1 even if it is 0    */
        /********************************************************************/
        if(0 == trans_unit_idx)
        {
            u2_csbf0flags |= 1;
        }

        if((blk_col + 1 < trans_size / 4)) /* checking right boundary */
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[blk_row * trans_size / 4 + blk_col + 1]])
            {
                /* set the 2nd bit of u2_csbf0flags for right csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 1);
            }
        }
        if((blk_row + 1 < trans_size / 4)) /* checking bottom boundary */
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[(blk_row + 1) * trans_size / 4 + blk_col]])
            {
                /* set the 3rd bit of u2_csbf0flags for bottom csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 2);
            }
        }

        /* storing u2_csbf0flags word */
        *pu2_out_data_coeff = u2_csbf0flags;
        pu2_out_data_coeff++;

        /* check for the csb flag in our scan order */
        if(u2_csbf0flags & 0x1)
        {
            WORD32 sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;
            WORD32 sign_flag;
            int16x4_t quant0, quant1, quant2, quant3;
            int16x8_t quant01, quant23;
            int8x8_t a, b, c, d, shuffle_0, shuffle_1;
            int8x16_t shuffle_out, shuffle_out_abs;
            uint8x16_t sign, eq0, eq1;

            /* x_pos=blk_col*4, y_pos=blk_row*4 */
            WORD16 *pi2_temp_quant_coeff =
                pi2_quant_coeffs + blk_col * 4 + (blk_row * 4) * trans_size;

            /* Load Quant Values */
            quant0 = vld1_s16(pi2_temp_quant_coeff + 0 * trans_size);
            quant1 = vld1_s16(pi2_temp_quant_coeff + 1 * trans_size);
            quant2 = vld1_s16(pi2_temp_quant_coeff + 2 * trans_size);
            quant3 = vld1_s16(pi2_temp_quant_coeff + 3 * trans_size);

            /* Two quant rows together */
            quant01 = vcombine_s16(quant0, quant1);
            quant23 = vcombine_s16(quant2, quant3);

            /* All 4 rows: For sign, gt0, gt1 flags, even 8 bit version is enough! */
            a = vqmovn_s16(quant01);
            b = vqmovn_s16(quant23);

            quant.val[0] = a;
            quant.val[1] = b;

            /* Reorder the 16 bytes from raster order into scan order */
            c = vget_low_s8(shuffle);
            d = vget_high_s8(shuffle);
            shuffle_0 = vtbl2_s8(quant, c);
            shuffle_1 = vtbl2_s8(quant, d);
            shuffle_out = vcombine_s8(shuffle_0, shuffle_1);

            /* ABS values */
            shuffle_out_abs = vabsq_s8(shuffle_out);

            /* sign bits : Will get 0xFF if (0 > shuffle_out) */
            sign = vcgtq_s8(zero, shuffle_out);

            /* gt0 : Will get 0xFF if ( shuffle_out == 0 ) */
            eq0 = vceqq_s8(shuffle_out, zero);

            /* gt1 : Will get 0xFF if ( abs(shuffle_out) == 1 ) */
            eq1 = vceqq_s8(shuffle_out_abs, one);

            /* movemask: upper 16 bits are zero; only the low 16 bits are needed when storing */
            sign_flag = movemask_neon(sign);
            sig_coeff_abs_gt0_flags = movemask_neon(eq0);
            sig_coeff_abs_gt1_flags = movemask_neon(eq1);

            /* Update gt0 and gt1 based on ==0 and ==1 flag */
            sig_coeff_abs_gt0_flags = ~sig_coeff_abs_gt0_flags; /* != 0 */
            sig_coeff_abs_gt1_flags = ~sig_coeff_abs_gt1_flags; /* (abs) != 1 */
            sig_coeff_abs_gt0_flags = sig_coeff_abs_gt0_flags & 0x0000FFFF; /* Clear high Word */
            sig_coeff_abs_gt1_flags = sig_coeff_abs_gt1_flags & sig_coeff_abs_gt0_flags;

            /* Update gt1 flag based on num_gt0_flag */
            num_gt0_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags);

            /* Find the position of 9th(MAX_GT_ONE+1) 1 in sig_coeff_abs_gt0_flags from MSB and update gt1 flag */
            if(num_gt0_flag > MAX_GT_ONE)
            {
                WORD32 gt0_first_byte = sig_coeff_abs_gt0_flags & 0xFF;
                WORD32 num_gt0_second_byte =
                    ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags & 0xFF00);
                WORD32 pos_nineth_one; /* pos. of 9th one from MSB of sig_coeff_abs_gt0_flags */
                WORD32 gt0_after_nineth_one, num_gt0_first_byte_to_nine;

                /* Number of ones still to be consumed from the low byte to reach the 9th one */
                num_gt0_first_byte_to_nine = (MAX_GT_ONE + 1) - num_gt0_second_byte;

                while(num_gt0_first_byte_to_nine)
                {
                    GET_POS_MSB_32(pos_nineth_one, gt0_first_byte);
                    /* NOTE(review): the trailing comment on the next statement is   */
                    /* truncated in this copy; it runs on to the end of "storing     */
                    /* u2_abs_coeff_remaining[i] 2 bytes". Compared with pass 1, no  */
                    /* stores of the gt0 / gt1 / sign words and no loop over the     */
                    /* remaining coefficients are visible here, and the blocks       */
                    /* opened above are not all closed. Restore from a pristine      */
                    /* copy before building.                                         */
                    gt0_first_byte = CLEAR_BIT(
                        gt0_first_byte,
                        pos_nineth_one); /*gt0_second_byte &= (~(0x1<> 2); quant_coeff = pi2_temp_quant_coeff[x_pos + (y_pos * trans_size)]; /* storing u2_abs_coeff_remaining[i] 2 bytes */
                /* The cast applies to abs() only; the subtraction is done afterwards */
                *pu2_out_data_coeff = (UWORD16)abs(quant_coeff) - 1;
                pu2_out_data_coeff++;
            }
        }
    }

    /* Distance between the write cursor and the start of the output buffer */
    num_bytes = (UWORD8 *)pu2_out_data_coeff - pu1_out_data;
    return num_bytes; /* Return the number of bytes written to out_data */
}