You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1021 lines
52 KiB

//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// * ih264e_half_pel.s
// *
// * @brief
// *
// *
// * @author
// * Ittiam
// *
// * @par List of Functions:
// * ih264e_sixtapfilter_horz
// * ih264e_sixtap_filter_2dvh_vert
//
// *
// * @remarks
// * None
// *
// *******************************************************************************
// */
.text
.p2align 2
.include "ih264_neon_macros.s"
///*******************************************************************************
//*
//* @brief
//*  Inter-prediction luma half-pel filter, horizontal direction
//*  (filter is run for width = 17 and height = 16).
//*
//* @par Description:
//*  Applies the 6-tap filter (1, -5, 20, 20, -5, 1) along each row, adds the
//*  rounding offset, shifts right by 5 and saturates the result to 8 bits:
//*      out = clip_u8((a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5)
//*  See sec 8.4.2.2.1 titled "Luma sample interpolation process".
//*
//* @param[in] pu1_src   (x0)
//*  UWORD8 pointer to the source. Each row is read starting at pu1_src - 2.
//*
//* @param[out] pu1_dst  (x1)
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd  (w2)
//*  integer source stride (sign-extended to 64 bits on entry)
//*
//* @param[in] dst_strd  (w3)
//*  integer destination stride (sign-extended to 64 bits on entry)
//*
//* @returns
//*  None
//*
//* @remarks
//*  - 16 rows are processed, two per loop iteration.
//*  - Per row, 24 source bytes are loaded (pu1_src - 2 .. pu1_src + 21) and
//*    18 destination bytes are written (17 rounded up to an even count).
//*  - Leaf routine with no stack usage: only x0-x3, x14, v0-v7 and v16-v31
//*    are modified, all of which are caller-saved under AAPCS64, so nothing
//*    has to be spilled or restored.
//*
//*******************************************************************************
//*/
//void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
//                              UWORD8 *pu1_dst,
//                              WORD32 src_strd,
//                              WORD32 dst_strd);
.equ halfpel_width , 17 + 1 //( make it even, two rows are processed at a time)
.global ih264e_sixtapfilter_horz_av8
ih264e_sixtapfilter_horz_av8:
    // Register roles
    //   x0  - source row pointer (starts at pu1_src - 2)
    //   x1  - destination write pointer
    //   x2  - source stride
    //   x3  - destination stride - 16 (applied after the 2-byte tail store)
    //   x14 - rows remaining
    //   v0  - filter tap 5,  v1 - filter tap 20
    //   v2-v4 / v5-v7   - 24 source bytes of row0 / row1
    //   v16-v18 / v19-v21 - 16-bit accumulators, columns 1..3 of row0 / row1
    //   v22-v24 / v25-v27 - 8-bit results,       columns 1..3 of row0 / row1
    //   v26-v31 - shifted source windows (temporaries)
    sxtw      x2, w2                          // strides arrive as 32-bit signed ints
    sxtw      x3, w3
    movi      v0.8b, #5
    sub       x0, x0, #2                      // a0 of the first output sits 2 pixels left of pu1_src
    sub       x3, x3, #16                     // the 16-byte store post-increments x1 by 16 already
    movi      v1.8b, #20
    mov       x14, #16                        // number of rows to produce
filter_horz_loop:
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 // Load row0
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 // Load row1
    // Processing row0 and row1.
    // Column3 windows wrap v4/v7 onto themselves; only the low lanes that
    // come from real source bytes end up in the two bytes that are stored.
    ext       v31.8b, v2.8b, v3.8b, #5
    ext       v30.8b, v3.8b, v4.8b, #5
    uaddl     v16.8h, v31.8b, v2.8b           // a0 + a5                             (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #5
    uaddl     v17.8h, v30.8b, v3.8b           // a0 + a5                             (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #5
    uaddl     v18.8h, v29.8b, v4.8b           // a0 + a5                             (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v19.8h, v28.8b, v5.8b           // a0 + a5                             (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #5
    uaddl     v20.8h, v27.8b, v6.8b           // a0 + a5                             (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v21.8h, v26.8b, v7.8b           // a0 + a5                             (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v16.8h, v31.8b, v1.8b           // a0 + a5 + 20a2                      (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #2
    umlal     v17.8h, v30.8b, v1.8b           // a0 + a5 + 20a2                      (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v18.8h, v29.8b, v1.8b           // a0 + a5 + 20a2                      (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v19.8h, v28.8b, v1.8b           // a0 + a5 + 20a2                      (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #2
    umlal     v20.8h, v27.8b, v1.8b           // a0 + a5 + 20a2                      (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v21.8h, v26.8b, v1.8b           // a0 + a5 + 20a2                      (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v16.8h, v31.8b, v1.8b           // a0 + a5 + 20a2 + 20a3               (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #3
    umlal     v17.8h, v30.8b, v1.8b           // a0 + a5 + 20a2 + 20a3               (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v18.8h, v29.8b, v1.8b           // a0 + a5 + 20a2 + 20a3               (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v19.8h, v28.8b, v1.8b           // a0 + a5 + 20a2 + 20a3               (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #3
    umlal     v20.8h, v27.8b, v1.8b           // a0 + a5 + 20a2 + 20a3               (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v21.8h, v26.8b, v1.8b           // a0 + a5 + 20a2 + 20a3               (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v16.8h, v31.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1         (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #1
    umlsl     v17.8h, v30.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1         (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v18.8h, v29.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1         (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v19.8h, v28.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1         (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #1
    umlsl     v20.8h, v27.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1         (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v21.8h, v26.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1         (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v16.8h, v31.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #4
    umlsl     v17.8h, v30.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v18.8h, v29.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #4
    umlsl     v19.8h, v28.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #4
    umlsl     v20.8h, v27.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column2,row1)
    umlsl     v21.8h, v26.8b, v0.8b           // a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column3,row1)
    // Round, shift and saturate to unsigned 8 bit: (sum + 16) >> 5.
    // The sums are treated as signed here, so negative values clamp to 0.
    sqrshrun  v22.8b, v16.8h, #5              // (column1,row0)
    sqrshrun  v23.8b, v17.8h, #5              // (column2,row0)
    sqrshrun  v24.8b, v18.8h, #5              // (column3,row0)
    sqrshrun  v25.8b, v19.8h, #5              // (column1,row1)
    sqrshrun  v26.8b, v20.8h, #5              // (column2,row1) - v26 is free as a temporary by now
    sqrshrun  v27.8b, v21.8h, #5              // (column3,row1) - v27 is free as a temporary by now
    st1       {v22.8b, v23.8b}, [x1], #16     // Store dest row0, bytes 0..15
    st1       {v24.h}[0], [x1], x3            // Store dest row0, bytes 16..17; advance to next row
    st1       {v25.8b, v26.8b}, [x1], #16     // Store dest row1, bytes 0..15
    st1       {v27.h}[0], [x1], x3            // Store dest row1, bytes 16..17; advance to next row
    subs      x14, x14, #2                    // two rows done per iteration
    bne       filter_horz_loop
    ret
///**
//*******************************************************************************
//*
//* @brief
//* This function implements a two stage cascaded six tap filter. It
//* applies the six tap filter in the vertical direction on the
//* predictor values, followed by applying the same filter in the
//* horizontal direction on the output of the first stage. The six tap
//* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
//* interpolation process"
//* (Filter run for width = 17 and height =17)
//* @par Description:
//* The function interpolates
//* the predictors first in the vertical direction and then in the
//* horizontal direction to output the (1/2,1/2). The output of the first
//* stage of the filter is stored in the buffer pointed to by pi16_pred1(only in C)
//* in 16 bit precision.
//*
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[out] pu1_dst1
//* UWORD8 pointer to the destination(vertical filtered output)
//*
//* @param[out] pu1_dst2
//* UWORD8 pointer to the destination (output after applying the horizontal filter to the intermediate vertical output)
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] dst_strd
//* integer destination stride of pu1_dst
//*
//* @param[in] pi16_pred1
//* Pointer to 16bit intermediate buffer(used only in c)
//*
//* @param[in] pi16_pred1_strd
//* integer destination stride of pi16_pred1
//*
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
// UWORD8 *pu1_dst1,
// UWORD8 *pu1_dst2,
// WORD32 src_strd,
// WORD32 dst_strd,
// WORD32 *pi16_pred1,/* Pointer to 16bit intermediate buffer (used only in c)*/
// WORD32 pi16_pred1_strd)
.global ih264e_sixtap_filter_2dvh_vert_av8
ih264e_sixtap_filter_2dvh_vert_av8:
// STMFD sp!,{x10,x11,x12,x14}
push_v_regs
sxtw x3, w3
sxtw x4, w4
stp x19, x20, [sp, #-16]!
////x0 - pu1_ref
////x3 - u4_ref_width
//// Load six rows for vertical interpolation
lsl x12, x3, #1
sub x0, x0, x12
sub x0, x0, #2
ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3
ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3
ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3
mov x12, #5
ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3
mov x14, #20
ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3
mov v0.h[0], w12
mov v0.h[1], w14
ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3
movi v1.8b, #20
//// x12 - u2_buff1_width
//// x14 - u2_buff2_width
mov x12, x4
add x11, x1, #16
mov x14, x12
mov x10, #3 //loop counter
sub x16 , x12, #8
sub x19, x14, #16
filter_2dvh_loop:
//// ////////////// ROW 1 ///////////////////////
//// Process first vertical interpolated row
//// each column is
uaddl v20.8h, v2.8b, v17.8b //// a0 + a5 (column1,row0)
movi v31.8b, #5
umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
mov v21.d[0], v20.d[1]
uaddl v22.8h, v3.8b, v18.8b //// a0 + a5 (column2,row0)
umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
ext v30.8b, v20.8b , v21.8b , #4
mov v23.d[0], v22.d[1]
uaddl v24.8h, v4.8b, v19.8b //// a0 + a5 (column3,row0)
ext v29.8b, v20.8b , v21.8b , #6
umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
mov v25.d[0], v24.d[1]
sqrshrun v2.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
ext v31.8b, v21.8b , v22.8b , #2
sqrshrun v3.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
ext v28.8b, v20.8b , v21.8b , #2
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
ext v31.8b, v22.8b , v23.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
ext v30.8b, v21.8b , v22.8b , #4
sqrshrun v4.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
ext v29.8b, v21.8b , v22.8b , #6
ext v28.8b, v21.8b , v22.8b , #2
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
ext v31.8b, v23.8b , v24.8b , #2
mov v21.d[0], v20.d[1]
ext v2.8b, v2.8b , v3.8b , #2
ext v3.8b, v3.8b , v4.8b , #2
ext v4.8b, v4.8b , v4.8b , #2
st1 {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid
st1 {v4.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
ext v30.8b, v22.8b , v23.8b , #4
ext v29.8b, v22.8b , v23.8b , #6
saddl v2.4s, v31.4h, v22.4h //// a0 + a5 (set3)
ext v28.8b, v22.8b , v23.8b , #2
smlal v2.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
smlal v2.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
smlsl v2.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
smlsl v2.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
ext v31.8b, v24.8b , v25.8b , #2
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
ext v30.8b, v23.8b , v24.8b , #4
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
ext v29.8b, v23.8b , v24.8b , #6
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
ext v28.8b, v23.8b , v24.8b , #2
ext v31.8b, v25.8b , v25.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
ext v30.8b, v24.8b , v25.8b , #4
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
ext v29.8b, v24.8b , v25.8b , #6
ext v31.8b, v24.8b , v25.8b , #2
shrn v28.4h, v2.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
mov v20.d[1], v21.d[0]
sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
//// ////////////// ROW 2 ///////////////////////
//// Process first vertical interpolated row
//// each column is
uaddl v20.8h, v5.8b, v2.8b //// a0 + a5 (column1,row0)
movi v31.8b, #5
umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
mov v21.d[0], v20.d[1]
mov v28.d[1], v29.d[0]
sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
uaddl v22.8h, v6.8b, v3.8b //// a0 + a5 (column2,row0)
umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
mov v23.d[0], v22.d[1]
sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5
ext v30.8b, v20.8b , v21.8b , #4
uaddl v24.8h, v7.8b, v4.8b //// a0 + a5 (column3,row0)
ext v29.8b, v20.8b , v21.8b , #6
umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
mov v25.d[0], v24.d[1]
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values
sqrshrun v5.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
ext v31.8b, v21.8b , v22.8b , #2
sqrshrun v6.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
ext v28.8b, v20.8b , v21.8b , #2
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
ext v31.8b, v22.8b , v23.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
ext v30.8b, v21.8b , v22.8b , #4
sqrshrun v7.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
ext v29.8b, v21.8b , v22.8b , #6
ext v28.8b, v21.8b , v22.8b , #2
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
ext v31.8b, v23.8b , v24.8b , #2
ext v5.8b, v5.8b , v6.8b , #2
ext v6.8b, v6.8b , v7.8b , #2
ext v7.8b, v7.8b , v7.8b , #2
st1 {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid
st1 {v7.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
ext v30.8b, v22.8b , v23.8b , #4
ext v29.8b, v22.8b , v23.8b , #6
saddl v6.4s, v31.4h, v22.4h //// a0 + a5 (set3)
ext v28.8b, v22.8b , v23.8b , #2
smlal v6.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
smlal v6.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
smlsl v6.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
smlsl v6.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
ext v31.8b, v24.8b , v25.8b , #2
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
ext v30.8b, v23.8b , v24.8b , #4
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
ext v29.8b, v23.8b , v24.8b , #6
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
ext v28.8b, v23.8b , v24.8b , #2
ext v31.8b, v25.8b , v25.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
ext v30.8b, v24.8b , v25.8b , #4
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
ext v29.8b, v24.8b , v25.8b , #6
ext v31.8b, v24.8b , v25.8b , #2
shrn v28.4h, v6.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
mov v20.d[1], v21.d[0]
sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
//// ////////////// ROW 3 ///////////////////////
//// Process first vertical interpolated row
//// each column is
uaddl v20.8h, v8.8b, v5.8b //// a0 + a5 (column1,row0)
movi v31.8b, #5
umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
mov v21.d[0], v20.d[1]
mov v28.d[1], v29.d[0]
sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
uaddl v22.8h, v9.8b, v6.8b //// a0 + a5 (column2,row0)
umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
mov v23.d[0], v22.d[1]
sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5
ext v30.8b, v20.8b , v21.8b , #4
uaddl v24.8h, v10.8b, v7.8b //// a0 + a5 (column3,row0)
ext v29.8b, v20.8b , v21.8b , #6
umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
mov v25.d[0], v24.d[1]
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
st1 { v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values
sqrshrun v8.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
ext v31.8b, v21.8b , v22.8b , #2
sqrshrun v9.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
ext v28.8b, v20.8b , v21.8b , #2
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
ext v31.8b, v22.8b , v23.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
ext v30.8b, v21.8b , v22.8b , #4
sqrshrun v10.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
ext v29.8b, v21.8b , v22.8b , #6
ext v28.8b, v21.8b , v22.8b , #2
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
ext v31.8b, v23.8b , v24.8b , #2
ext v8.8b, v8.8b , v9.8b , #2
ext v9.8b, v9.8b , v10.8b , #2
ext v10.8b, v10.8b , v10.8b , #2
st1 {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid
st1 {v10.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
ext v30.8b, v22.8b , v23.8b , #4
ext v29.8b, v22.8b , v23.8b , #6
saddl v8.4s, v31.4h, v22.4h //// a0 + a5 (set3)
ext v28.8b, v22.8b , v23.8b , #2
smlal v8.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
smlal v8.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
smlsl v8.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
smlsl v8.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
ext v31.8b, v24.8b , v25.8b , #2
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
ext v30.8b, v23.8b , v24.8b , #4
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
ext v29.8b, v23.8b , v24.8b , #6
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
ext v28.8b, v23.8b , v24.8b , #2
ext v31.8b, v25.8b , v25.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
ext v30.8b, v24.8b , v25.8b , #4
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
ext v29.8b, v24.8b , v25.8b , #6
ext v31.8b, v24.8b , v25.8b , #2
shrn v28.4h, v8.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
mov v20.d[1], v21.d[0]
sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
//// ////////////// ROW 4 ///////////////////////
//// Process first vertical interpolated row
//// each column is
uaddl v20.8h, v11.8b, v8.8b //// a0 + a5 (column1,row0)
movi v31.8b, #5
umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
mov v21.d[0], v20.d[1]
mov v28.d[1], v29.d[0]
sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
uaddl v22.8h, v12.8b, v9.8b //// a0 + a5 (column2,row0)
umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
mov v23.d[0], v22.d[1]
sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5
ext v30.8b, v20.8b , v21.8b , #4
uaddl v24.8h, v13.8b, v10.8b //// a0 + a5 (column3,row0)
ext v29.8b, v20.8b , v21.8b , #6
umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
mov v25.d[0], v24.d[1]
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values
sqrshrun v11.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
ext v31.8b, v21.8b , v22.8b , #2
sqrshrun v12.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
ext v28.8b, v20.8b , v21.8b , #2
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
ext v31.8b, v22.8b , v23.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
ext v30.8b, v21.8b , v22.8b , #4
sqrshrun v13.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
ext v29.8b, v21.8b , v22.8b , #6
ext v28.8b, v21.8b , v22.8b , #2
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
ext v31.8b, v23.8b , v24.8b , #2
ext v11.8b, v11.8b , v12.8b , #2
ext v12.8b, v12.8b , v13.8b , #2
ext v13.8b, v13.8b , v13.8b , #2
st1 {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid
st1 {v13.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
ext v30.8b, v22.8b , v23.8b , #4
ext v29.8b, v22.8b , v23.8b , #6
saddl v12.4s, v31.4h, v22.4h //// a0 + a5 (set3)
ext v28.8b, v22.8b , v23.8b , #2
smlal v12.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
smlal v12.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
smlsl v12.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
smlsl v12.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
ext v31.8b, v24.8b , v25.8b , #2
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
ext v30.8b, v23.8b , v24.8b , #4
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
ext v29.8b, v23.8b , v24.8b , #6
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
ext v28.8b, v23.8b , v24.8b , #2
ext v31.8b, v25.8b , v25.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
ext v30.8b, v24.8b , v25.8b , #4
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
ext v29.8b, v24.8b , v25.8b , #6
ext v31.8b, v24.8b , v25.8b , #2
shrn v28.4h, v12.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
mov v20.d[1], v21.d[0]
sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
//// ////////////// ROW 5 ///////////////////////
//// Process first vertical interpolated row
//// each column is
uaddl v20.8h, v14.8b, v11.8b //// a0 + a5 (column1,row0)
movi v31.8b, #5
umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
mov v21.d[0], v20.d[1]
mov v28.d[1], v29.d[0]
sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
uaddl v22.8h, v15.8b, v12.8b //// a0 + a5 (column2,row0)
umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
mov v23.d[0], v22.d[1]
sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5
ext v30.8b, v20.8b , v21.8b , #4
uaddl v24.8h, v16.8b, v13.8b //// a0 + a5 (column3,row0)
ext v29.8b, v20.8b , v21.8b , #6
umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
mov v25.d[0], v24.d[1]
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values
sqrshrun v14.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
ext v31.8b, v21.8b , v22.8b , #2
sqrshrun v15.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
ext v28.8b, v20.8b , v21.8b , #2
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
ext v31.8b, v22.8b , v23.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
ext v30.8b, v21.8b , v22.8b , #4
sqrshrun v16.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
ext v29.8b, v21.8b , v22.8b , #6
ext v28.8b, v21.8b , v22.8b , #2
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
ext v31.8b, v23.8b , v24.8b , #2
ext v14.8b, v14.8b , v15.8b , #2
ext v15.8b, v15.8b , v16.8b , #2
ext v16.8b, v16.8b , v16.8b , #2
st1 {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid
st1 {v16.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
ext v30.8b, v22.8b , v23.8b , #4
ext v29.8b, v22.8b , v23.8b , #6
saddl v14.4s, v31.4h, v22.4h //// a0 + a5 (set3)
ext v28.8b, v22.8b , v23.8b , #2
smlal v14.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
smlal v14.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
smlsl v14.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
smlsl v14.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
ext v31.8b, v24.8b , v25.8b , #2
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
ext v30.8b, v23.8b , v24.8b , #4
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
ext v29.8b, v23.8b , v24.8b , #6
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
ext v28.8b, v23.8b , v24.8b , #2
ext v31.8b, v25.8b , v25.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
ext v30.8b, v24.8b , v25.8b , #4
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
ext v29.8b, v24.8b , v25.8b , #6
ext v31.8b, v24.8b , v25.8b , #2
shrn v28.4h, v14.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
mov v20.d[1], v21.d[0]
sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
//// ////////////// ROW 6 ///////////////////////
//// Process first vertical interpolated row
//// each column is
cmp x10, #1 //// if it 17 rows are complete skip
beq filter_2dvh_skip_row
uaddl v20.8h, v17.8b, v14.8b //// a0 + a5 (column1,row0)
movi v31.8b, #5
umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
mov v21.d[0], v20.d[1]
mov v28.d[1], v29.d[0]
sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
uaddl v22.8h, v18.8b, v15.8b //// a0 + a5 (column2,row0)
umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
mov v23.d[0], v22.d[1]
sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5
ext v30.8b, v20.8b , v21.8b , #4
uaddl v24.8h, v19.8b, v16.8b //// a0 + a5 (column3,row0)
ext v29.8b, v20.8b , v21.8b , #6
umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
mov v25.d[0], v24.d[1]
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values
sqrshrun v17.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
ext v31.8b, v21.8b , v22.8b , #2
sqrshrun v18.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
ext v28.8b, v20.8b , v21.8b , #2
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
ext v31.8b, v22.8b , v23.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
ext v30.8b, v21.8b , v22.8b , #4
sqrshrun v19.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
ext v29.8b, v21.8b , v22.8b , #6
ext v28.8b, v21.8b , v22.8b , #2
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
ext v31.8b, v23.8b , v24.8b , #2
ext v17.8b, v17.8b , v18.8b , #2
ext v18.8b, v18.8b , v19.8b , #2
ext v19.8b, v19.8b , v19.8b , #2
st1 {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid
st1 {v19.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
ext v30.8b, v22.8b , v23.8b , #4
ext v29.8b, v22.8b , v23.8b , #6
saddl v18.4s, v31.4h, v22.4h //// a0 + a5 (set3)
ext v28.8b, v22.8b , v23.8b , #2
smlal v18.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
smlal v18.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
smlsl v18.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
smlsl v18.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
ext v31.8b, v24.8b , v25.8b , #2
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
ext v30.8b, v23.8b , v24.8b , #4
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
ext v29.8b, v23.8b , v24.8b , #6
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
ext v28.8b, v23.8b , v24.8b , #2
ext v31.8b, v25.8b , v25.8b , #2
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
ext v30.8b, v24.8b , v25.8b , #4
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
ext v29.8b, v24.8b , v25.8b , #6
ext v31.8b, v24.8b , v25.8b , #2
shrn v28.4h, v18.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
mov v20.d[1], v21.d[0]
sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2
mov v28.d[1], v29.d[0]
sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values
subs x10, x10, #1 ////decrement loop counter
bne filter_2dvh_loop
//// Process first vertical interpolated row
//// each column is
//// ////////////// ROW 13 ///////////////////////
//// Process first vertical interpolated row
//// each column is
// LDMFD sp!,{x10,x11,x12,pc}
ldp x19, x20, [sp], #16
pop_v_regs
ret
//// Exit path taken from the loop when x10 == 1 (cmp x10, #1 / beq above).
//// Skips the vertical filtering of a further row and only finishes the
//// half,half grid output row that is still in flight, then returns.
//// Register state on entry, as left by the loop body before the branch:
////   v26       - packed 8-bit half,half grid results for set1,2
////   v28 (low) - 16-bit intermediate for set3 (already shifted right by 8)
////   v29 (low) - 16-bit intermediate for set4 (already shifted right by 8)
////   v22       - 32-bit filter accumulators for set5
////   x2        - half,half grid destination pointer, x19 - its post-increment
filter_2dvh_skip_row:
mov v28.d[1], v29.d[0] //// pack set3 (low half) and set4 (high half) into one 8h vector
sqrshrun v27.8b, v28.8h, #2 //// half,half grid set3,4: rounding shift right by 2, saturate to unsigned 8 bit
shrn v28.4h, v22.4s, #8 //// set5: shift by 8 now, the remaining shift by 2 with rounding follows
sqrshrun v28.8b, v28.8h, #2 //// half,half grid set5: rounding shift right by 2, saturate to unsigned 8 bit
st1 {v26.8b, v27.8b}, [x2], #16 //// store 16 bytes of half,half grid values (set1..4), x2 += 16
st1 {v28.h}[0], [x2], x19 //// store only the first halfword (2 bytes) of set5, then x2 += x19
// LDMFD sp!,{x10,x11,x12,pc}    (commented-out ARM32-style epilogue, kept for reference)
ldp x19, x20, [sp], #16 //// restore x19, x20 from the stack and release 16 bytes
pop_v_regs //// macro (defined outside this section; presumably restores the saved vector registers - confirm in ih264_neon_macros.s)
ret
///*****************************************