//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264e_half_pel.s
// *
// * @brief
// *  Contains luma half-pel interpolation functions (six-tap filtering) used by
// *  the H.264 encoder.
// *
// * @author
// *  Ittiam
// *
// * @par List of Functions:
// *  ih264e_sixtapfilter_horz
// *  ih264e_sixtap_filter_2dvh_vert
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */

.text
.p2align 2
.include "ih264_neon_macros.s"

///*******************************************************************************
//*
//* @brief
//*  Inter-prediction luma filter for horizontal input (filter run for
//*  width = 17 and height = 16).
//*
//* @par Description:
//*  Applies a 6-tap horizontal filter; the output is clipped to 8 bits.
//*  See sec 8.4.2.2.1, "Luma sample interpolation process".
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
//                              UWORD8 *pu1_dst,
//                              WORD32 src_strd,
//                              WORD32 dst_strd);

.equ halfpel_width, 17 + 1          // (make it even; two rows are processed at a time)

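//// For reference, a C-style sketch (comments only, illustrative) of the
//// per-pixel math the loop below vectorizes. CLIP_U8() is a placeholder
//// name for clipping to the range [0, 255].
////
////     /* H.264 six-tap half-pel filter, taps {1, -5, 20, 20, -5, 1}      */
////     WORD32 sum = pu1_src[-2] - 5 * pu1_src[-1] + 20 * pu1_src[0]
////                + 20 * pu1_src[1] - 5 * pu1_src[2] + pu1_src[3];
////     pu1_dst[0] = CLIP_U8((sum + 16) >> 5);   /* round and clip to 8 bits */
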
.global ih264e_sixtapfilter_horz_av8
ih264e_sixtapfilter_horz_av8:
// STMFD sp!,{x14}
    push_v_regs
    sxtw      x2, w2
    sxtw      x3, w3
    stp       x19, x20, [sp, #-16]!

    movi      v0.8b, #5
    sub       x0, x0, #2
    sub       x3, x3, #16
    movi      v1.8b, #20
    mov       x14, #16

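//// Note on the loop structure (derived from the code below): x14 counts
//// remaining rows and is decremented by 2, so filter_horz_loop runs 8 times
//// and filters two source rows per pass. Each row stores 16 + 2 = 18 bytes
//// (halfpel_width); x3 was reduced by 16 above so the post-indexed stores
//// advance x1 by exactly one destination stride per row.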
filter_horz_loop:

    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2     //// Load row0
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2     //// Load row1

    //// Processing row0 and row1

    ext       v31.8b, v2.8b, v3.8b, #5
    ext       v30.8b, v3.8b, v4.8b, #5

    uaddl     v8.8h, v31.8b, v2.8b                //// a0 + a5 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #5
    uaddl     v10.8h, v30.8b, v3.8b               //// a0 + a5 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #5
    uaddl     v12.8h, v29.8b, v4.8b               //// a0 + a5 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b               //// a0 + a5 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #5

    uaddl     v16.8h, v27.8b, v6.8b               //// a0 + a5 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v18.8h, v26.8b, v7.8b               //// a0 + a5 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v8.8h, v31.8b, v1.8b                //// a0 + a5 + 20a2 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #2
    umlal     v10.8h, v30.8b, v1.8b               //// a0 + a5 + 20a2 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v12.8h, v29.8b, v1.8b               //// a0 + a5 + 20a2 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v14.8h, v28.8b, v1.8b               //// a0 + a5 + 20a2 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #2

    umlal     v16.8h, v27.8b, v1.8b               //// a0 + a5 + 20a2 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v18.8h, v26.8b, v1.8b               //// a0 + a5 + 20a2 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v8.8h, v31.8b, v1.8b                //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #3
    umlal     v10.8h, v30.8b, v1.8b               //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v12.8h, v29.8b, v1.8b               //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v14.8h, v28.8b, v1.8b               //// a0 + a5 + 20a2 + 20a3 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #3

    umlal     v16.8h, v27.8b, v1.8b               //// a0 + a5 + 20a2 + 20a3 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v18.8h, v26.8b, v1.8b               //// a0 + a5 + 20a2 + 20a3 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v8.8h, v31.8b, v0.8b                //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #1
    umlsl     v10.8h, v30.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v12.8h, v29.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v14.8h, v28.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #1

    umlsl     v16.8h, v27.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v18.8h, v26.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v8.8h, v31.8b, v0.8b                //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #4
    umlsl     v10.8h, v30.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v12.8h, v29.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #4
    umlsl     v14.8h, v28.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #4

    umlsl     v16.8h, v27.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
    umlsl     v18.8h, v26.8b, v0.8b               //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row1)

    sqrshrun  v20.8b, v8.8h, #5                   //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    sqrshrun  v21.8b, v10.8h, #5                  //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    sqrshrun  v22.8b, v12.8h, #5                  //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    sqrshrun  v23.8b, v14.8h, #5                  //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
    sqrshrun  v24.8b, v16.8h, #5                  //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
    sqrshrun  v25.8b, v18.8h, #5                  //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row1)

    st1       {v20.8b, v21.8b}, [x1], #16         //// Store dest row0
    st1       {v22.h}[0], [x1], x3
    st1       {v23.8b, v24.8b}, [x1], #16         //// Store dest row1
    st1       {v25.h}[0], [x1], x3

    subs      x14, x14, #2                        // decrement counter

    bne       filter_horz_loop

// LDMFD sp!,{pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

///**
//*******************************************************************************
//*
//* @brief
//*  This function implements a two-stage cascaded six-tap filter. It
//*  applies the six-tap filter in the vertical direction on the
//*  predictor values, then applies the same filter in the horizontal
//*  direction on the output of the first stage. The six-tap filtering
//*  operation is described in sec 8.4.2.2.1 titled "Luma sample
//*  interpolation process".
//*  (Filter run for width = 17 and height = 17.)
//*
//* @par Description:
//*  The function interpolates the predictors first in the vertical
//*  direction and then in the horizontal direction to output the
//*  (1/2,1/2) samples. The output of the first stage of the filter is
//*  stored in the buffer pointed to by pi16_pred1 (only in C) in 16-bit
//*  precision.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst1
//*  UWORD8 pointer to the destination (vertically filtered output)
//*
//* @param[out] pu1_dst2
//*  UWORD8 pointer to the destination (output after applying the horizontal
//*  filter to the intermediate vertical output)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride of pu1_dst
//*
//* @param[in] pi16_pred1
//*  pointer to the 16-bit intermediate buffer (used only in C)
//*
//* @param[in] pi16_pred1_strd
//*  integer destination stride of pi16_pred1
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
//                                    UWORD8 *pu1_dst1,
//                                    UWORD8 *pu1_dst2,
//                                    WORD32 src_strd,
//                                    WORD32 dst_strd,
//                                    WORD32 *pi16_pred1,/* Pointer to 16-bit intermediate buffer (used only in C) */
//                                    WORD32 pi16_pred1_strd)

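//// For reference, a C-style sketch (comments only, illustrative) of the
//// two-stage math the loop below vectorizes; i16[] stands for the 16-bit
//// intermediate row and CLIP_U8() for clipping to [0, 255].
////
////     /* stage 1: vertical six-tap filter into the 16-bit intermediate    */
////     i16[x] = src[x - 2*strd] - 5*src[x - strd]   + 20*src[x]
////            + 20*src[x + strd] - 5*src[x + 2*strd] +    src[x + 3*strd];
////     pu1_dst1[x] = CLIP_U8((i16[x] + 16) >> 5);         /* (1,1/2) grid   */
////
////     /* stage 2: horizontal six-tap filter on the intermediate values    */
////     s = i16[x-2] - 5*i16[x-1] + 20*i16[x]
////       + 20*i16[x+1] - 5*i16[x+2] + i16[x+3];
////     pu1_dst2[x] = CLIP_U8((s + 512) >> 10);            /* (1/2,1/2) grid */
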
.global ih264e_sixtap_filter_2dvh_vert_av8

ih264e_sixtap_filter_2dvh_vert_av8:
// STMFD sp!,{x10,x11,x12,x14}
    push_v_regs
    sxtw      x3, w3
    sxtw      x4, w4
    stp       x19, x20, [sp, #-16]!

    //// x0 - pu1_ref
    //// x3 - u4_ref_width

    //// Load six rows for vertical interpolation
    lsl       x12, x3, #1
    sub       x0, x0, x12
    sub       x0, x0, #2
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3
    ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3
    mov       x12, #5
    ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3
    mov       x14, #20
    ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3
    mov       v0.h[0], w12
    mov       v0.h[1], w14
    ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3
    movi      v1.8b, #20

    //// x12 - u2_buff1_width
    //// x14 - u2_buff2_width
    mov       x12, x4
    add       x11, x1, #16

    mov       x14, x12

    mov       x10, #3                             // loop counter
    sub       x16, x12, #8
    sub       x19, x14, #16
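//// Register roles in the loop below (descriptive note derived from the code):
////   v0.h[0] = 5, v0.h[1] = 20, v1.8b = 20  - six-tap filter constants
////   x12 - dst_strd, stride for the (1,1/2) grid rows written via x1/x11
////   x11 - pu1_dst1 + 16, tail pointer for the last pixels of each row
////   x19 - dst_strd - 16, compensates the (1/2,1/2) stores for the 16 bytes
////         already written with post-increment
////   x10 - outer loop counter (3); each pass emits six rows, and on the last
////         pass the sixth row branches to filter_2dvh_skip_row so that 17
////         rows in total are written to each destination.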
filter_2dvh_loop:
|
|
|
|
//// ////////////// ROW 1 ///////////////////////
|
|
|
|
//// Process first vertical interpolated row
|
|
//// each column is
|
|
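//// Note on the software pipelining used in ROW 1 - ROW 6 (descriptive,
//// derived from the code): the (1/2,1/2) values started at the end of one
//// ROW block are narrowed and stored near the top of the next ROW block,
//// which is why the stores to x2 lag the vertical filtering by one row.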
uaddl v20.8h, v2.8b, v17.8b //// a0 + a5 (column1,row0)
|
|
movi v31.8b, #5
|
|
umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
|
|
umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
|
|
umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
|
|
umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
|
|
mov v21.d[0], v20.d[1]
|
|
|
|
uaddl v22.8h, v3.8b, v18.8b //// a0 + a5 (column2,row0)
|
|
umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
|
|
umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
|
|
umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
|
|
umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
|
|
ext v30.8b, v20.8b , v21.8b , #4
|
|
mov v23.d[0], v22.d[1]
|
|
|
|
|
|
uaddl v24.8h, v4.8b, v19.8b //// a0 + a5 (column3,row0)
|
|
ext v29.8b, v20.8b , v21.8b , #6
|
|
umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
|
|
umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
|
|
umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
|
|
umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
|
|
mov v25.d[0], v24.d[1]
|
|
|
|
sqrshrun v2.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
|
|
ext v31.8b, v21.8b , v22.8b , #2
|
|
sqrshrun v3.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
|
|
ext v28.8b, v20.8b , v21.8b , #2
|
|
|
|
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
|
|
ext v31.8b, v22.8b , v23.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
|
|
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
|
|
ext v30.8b, v21.8b , v22.8b , #4
|
|
|
|
sqrshrun v4.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
|
|
ext v29.8b, v21.8b , v22.8b , #6
|
|
|
|
ext v28.8b, v21.8b , v22.8b , #2
|
|
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
|
|
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
|
|
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
|
|
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
|
|
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
|
|
ext v31.8b, v23.8b , v24.8b , #2
|
|
mov v21.d[0], v20.d[1]
|
|
ext v2.8b, v2.8b , v3.8b , #2
|
|
ext v3.8b, v3.8b , v4.8b , #2
|
|
ext v4.8b, v4.8b , v4.8b , #2
|
|
|
|
st1 {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid
|
|
st1 {v4.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
|
|
|
|
ext v30.8b, v22.8b , v23.8b , #4
|
|
ext v29.8b, v22.8b , v23.8b , #6
|
|
|
|
saddl v2.4s, v31.4h, v22.4h //// a0 + a5 (set3)
|
|
ext v28.8b, v22.8b , v23.8b , #2
|
|
smlal v2.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
|
|
smlal v2.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
|
|
smlsl v2.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
|
|
smlsl v2.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
|
|
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
|
|
ext v30.8b, v23.8b , v24.8b , #4
|
|
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
|
|
ext v29.8b, v23.8b , v24.8b , #6
|
|
|
|
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
|
|
ext v28.8b, v23.8b , v24.8b , #2
|
|
ext v31.8b, v25.8b , v25.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
|
|
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
|
|
ext v30.8b, v24.8b , v25.8b , #4
|
|
|
|
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
|
|
ext v29.8b, v24.8b , v25.8b , #6
|
|
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
shrn v28.4h, v2.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
|
|
|
|
ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data
|
|
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
|
|
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
|
|
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
|
|
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
|
|
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
|
|
mov v20.d[1], v21.d[0]
|
|
sqrshrun v26.8b, v20.8h, #2 //// half,half grid set1,2
|
|
|
|
|
|
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
|
|
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
|
|
|
|
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
|
|
//// ////////////// ROW 2 ///////////////////////
|
|
|
|
//// Process first vertical interpolated row
|
|
//// each column is
|
|
uaddl v20.8h, v5.8b, v2.8b //// a0 + a5 (column1,row0)
|
|
movi v31.8b, #5
|
|
umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
|
|
umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
|
|
umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
|
|
umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
|
|
mov v21.d[0], v20.d[1]
|
|
|
|
mov v28.d[1], v29.d[0]
|
|
sqrshrun v27.8b, v28.8h, #2 //// half,half grid set3,4
|
|
|
|
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
uaddl v22.8h, v6.8b, v3.8b //// a0 + a5 (column2,row0)
|
|
umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
|
|
umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
|
|
umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
|
|
umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
|
|
mov v23.d[0], v22.d[1]
|
|
|
|
sqrshrun v28.8b, v28.8h, #2 //// half,half grid set5
|
|
ext v30.8b, v20.8b , v21.8b , #4
|
|
|
|
uaddl v24.8h, v7.8b, v4.8b //// a0 + a5 (column3,row0)
|
|
ext v29.8b, v20.8b , v21.8b , #6
|
|
umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
|
|
umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
|
|
umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
|
|
umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
|
|
mov v25.d[0], v24.d[1]
|
|
|
|
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1/2 grid values
|
|
|
|
sqrshrun v5.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
|
|
ext v31.8b, v21.8b , v22.8b , #2
|
|
sqrshrun v6.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
|
|
ext v28.8b, v20.8b , v21.8b , #2
|
|
|
|
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
|
|
ext v31.8b, v22.8b , v23.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
|
|
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
|
|
ext v30.8b, v21.8b , v22.8b , #4
|
|
|
|
sqrshrun v7.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
|
|
ext v29.8b, v21.8b , v22.8b , #6
|
|
|
|
ext v28.8b, v21.8b , v22.8b , #2
|
|
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
|
|
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
|
|
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
|
|
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
|
|
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
|
|
ext v31.8b, v23.8b , v24.8b , #2
|
|
|
|
ext v5.8b, v5.8b , v6.8b , #2
|
|
ext v6.8b, v6.8b , v7.8b , #2
|
|
ext v7.8b, v7.8b , v7.8b , #2
|
|
|
|
st1 {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid
|
|
st1 {v7.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
|
|
|
|
ext v30.8b, v22.8b , v23.8b , #4
|
|
ext v29.8b, v22.8b , v23.8b , #6
|
|
|
|
saddl v6.4s, v31.4h, v22.4h //// a0 + a5 (set3)
|
|
ext v28.8b, v22.8b , v23.8b , #2
|
|
smlal v6.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
|
|
smlal v6.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
|
|
smlsl v6.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
|
|
smlsl v6.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
|
|
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
|
|
ext v30.8b, v23.8b , v24.8b , #4
|
|
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
|
|
ext v29.8b, v23.8b , v24.8b , #6
|
|
|
|
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
|
|
ext v28.8b, v23.8b , v24.8b , #2
|
|
ext v31.8b, v25.8b , v25.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
|
|
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
|
|
ext v30.8b, v24.8b , v25.8b , #4
|
|
|
|
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
|
|
ext v29.8b, v24.8b , v25.8b , #6
|
|
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
shrn v28.4h, v6.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
|
|
|
|
ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data
|
|
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
|
|
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
|
|
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
|
|
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
|
|
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
|
|
mov v20.d[1], v21.d[0]
|
|
sqrshrun v26.8b, v20.8h, #2 //// half,half grid set1,2
|
|
|
|
|
|
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
|
|
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
|
|
|
|
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
|
|
//// ////////////// ROW 3 ///////////////////////
|
|
|
|
//// Process first vertical interpolated row
|
|
//// each column is
|
|
uaddl v20.8h, v8.8b, v5.8b //// a0 + a5 (column1,row0)
|
|
movi v31.8b, #5
|
|
umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
|
|
umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
|
|
umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
|
|
umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
|
|
mov v21.d[0], v20.d[1]
|
|
|
|
mov v28.d[1], v29.d[0]
|
|
sqrshrun v27.8b, v28.8h, #2 //// half,half grid set3,4
|
|
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
uaddl v22.8h, v9.8b, v6.8b //// a0 + a5 (column2,row0)
|
|
umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
|
|
umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
|
|
umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
|
|
umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
|
|
mov v23.d[0], v22.d[1]
|
|
|
|
sqrshrun v28.8b, v28.8h, #2 //// half,half grid set5
|
|
ext v30.8b, v20.8b , v21.8b , #4
|
|
|
|
uaddl v24.8h, v10.8b, v7.8b //// a0 + a5 (column3,row0)
|
|
ext v29.8b, v20.8b , v21.8b , #6
|
|
umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
|
|
umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
|
|
umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
|
|
umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
|
|
mov v25.d[0], v24.d[1]
|
|
|
|
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1/2 grid values
|
|
|
|
sqrshrun v8.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
|
|
ext v31.8b, v21.8b , v22.8b , #2
|
|
sqrshrun v9.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
|
|
ext v28.8b, v20.8b , v21.8b , #2
|
|
|
|
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
|
|
ext v31.8b, v22.8b , v23.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
|
|
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
|
|
ext v30.8b, v21.8b , v22.8b , #4
|
|
|
|
sqrshrun v10.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
|
|
ext v29.8b, v21.8b , v22.8b , #6
|
|
|
|
ext v28.8b, v21.8b , v22.8b , #2
|
|
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
|
|
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
|
|
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
|
|
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
|
|
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
|
|
ext v31.8b, v23.8b , v24.8b , #2
|
|
|
|
ext v8.8b, v8.8b , v9.8b , #2
|
|
ext v9.8b, v9.8b , v10.8b , #2
|
|
ext v10.8b, v10.8b , v10.8b , #2
|
|
|
|
st1 {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid
|
|
st1 {v10.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
|
|
|
|
ext v30.8b, v22.8b , v23.8b , #4
|
|
ext v29.8b, v22.8b , v23.8b , #6
|
|
|
|
saddl v8.4s, v31.4h, v22.4h //// a0 + a5 (set3)
|
|
ext v28.8b, v22.8b , v23.8b , #2
|
|
smlal v8.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
|
|
smlal v8.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
|
|
smlsl v8.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
|
|
smlsl v8.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
|
|
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
|
|
ext v30.8b, v23.8b , v24.8b , #4
|
|
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
|
|
ext v29.8b, v23.8b , v24.8b , #6
|
|
|
|
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
|
|
ext v28.8b, v23.8b , v24.8b , #2
|
|
ext v31.8b, v25.8b , v25.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
|
|
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
|
|
ext v30.8b, v24.8b , v25.8b , #4
|
|
|
|
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
|
|
ext v29.8b, v24.8b , v25.8b , #6
|
|
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
shrn v28.4h, v8.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
|
|
|
|
ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data
|
|
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
|
|
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
|
|
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
|
|
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
|
|
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
|
|
mov v20.d[1], v21.d[0]
|
|
sqrshrun v26.8b, v20.8h, #2 //// half,half grid set1,2
|
|
|
|
|
|
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
|
|
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
|
|
|
|
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
|
|
//// ////////////// ROW 4 ///////////////////////
|
|
|
|
//// Process first vertical interpolated row
|
|
//// each column is
|
|
uaddl v20.8h, v11.8b, v8.8b //// a0 + a5 (column1,row0)
|
|
movi v31.8b, #5
|
|
umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
|
|
umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
|
|
umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
|
|
umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
|
|
mov v21.d[0], v20.d[1]
|
|
mov v28.d[1], v29.d[0]
|
|
sqrshrun v27.8b, v28.8h, #2 //// half,half grid set3,4
|
|
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
uaddl v22.8h, v12.8b, v9.8b //// a0 + a5 (column2,row0)
|
|
umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
|
|
umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
|
|
umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
|
|
umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
|
|
mov v23.d[0], v22.d[1]
|
|
|
|
sqrshrun v28.8b, v28.8h, #2 //// half,half grid set5
|
|
ext v30.8b, v20.8b , v21.8b , #4
|
|
|
|
uaddl v24.8h, v13.8b, v10.8b //// a0 + a5 (column3,row0)
|
|
ext v29.8b, v20.8b , v21.8b , #6
|
|
umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
|
|
umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
|
|
umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
|
|
umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
|
|
mov v25.d[0], v24.d[1]
|
|
|
|
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1/2 grid values
|
|
|
|
sqrshrun v11.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
|
|
ext v31.8b, v21.8b , v22.8b , #2
|
|
sqrshrun v12.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
|
|
ext v28.8b, v20.8b , v21.8b , #2
|
|
|
|
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
|
|
ext v31.8b, v22.8b , v23.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
|
|
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
|
|
ext v30.8b, v21.8b , v22.8b , #4
|
|
|
|
sqrshrun v13.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
|
|
ext v29.8b, v21.8b , v22.8b , #6
|
|
|
|
ext v28.8b, v21.8b , v22.8b , #2
|
|
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
|
|
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
|
|
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
|
|
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
|
|
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
|
|
ext v31.8b, v23.8b , v24.8b , #2
|
|
|
|
ext v11.8b, v11.8b , v12.8b , #2
|
|
ext v12.8b, v12.8b , v13.8b , #2
|
|
ext v13.8b, v13.8b , v13.8b , #2
|
|
|
|
st1 {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid
|
|
st1 {v13.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
|
|
|
|
ext v30.8b, v22.8b , v23.8b , #4
|
|
ext v29.8b, v22.8b , v23.8b , #6
|
|
|
|
saddl v12.4s, v31.4h, v22.4h //// a0 + a5 (set3)
|
|
ext v28.8b, v22.8b , v23.8b , #2
|
|
smlal v12.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
|
|
smlal v12.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
|
|
smlsl v12.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
|
|
smlsl v12.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
|
|
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
|
|
ext v30.8b, v23.8b , v24.8b , #4
|
|
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
|
|
ext v29.8b, v23.8b , v24.8b , #6
|
|
|
|
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
|
|
ext v28.8b, v23.8b , v24.8b , #2
|
|
ext v31.8b, v25.8b , v25.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
|
|
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
|
|
ext v30.8b, v24.8b , v25.8b , #4
|
|
|
|
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
|
|
ext v29.8b, v24.8b , v25.8b , #6
|
|
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
shrn v28.4h, v12.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
|
|
|
|
ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data
|
|
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
|
|
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
|
|
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
|
|
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
|
|
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
|
|
mov v20.d[1], v21.d[0]
|
|
sqrshrun v26.8b, v20.8h, #2 //// half,half grid set1,2
|
|
|
|
|
|
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
|
|
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
|
|
|
|
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
|
|
//// ////////////// ROW 5 ///////////////////////
|
|
|
|
//// Process first vertical interpolated row
|
|
//// each column is
|
|
uaddl v20.8h, v14.8b, v11.8b //// a0 + a5 (column1,row0)
|
|
movi v31.8b, #5
|
|
umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
|
|
umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
|
|
umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
|
|
umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
|
|
mov v21.d[0], v20.d[1]
|
|
mov v28.d[1], v29.d[0]
|
|
sqrshrun v27.8b, v28.8h, #2 //// half,half grid set3,4
|
|
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
uaddl v22.8h, v15.8b, v12.8b //// a0 + a5 (column2,row0)
|
|
umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
|
|
umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
|
|
umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
|
|
umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
|
|
mov v23.d[0], v22.d[1]
|
|
|
|
sqrshrun v28.8b, v28.8h, #2 //// half,half grid set5
|
|
ext v30.8b, v20.8b , v21.8b , #4
|
|
|
|
uaddl v24.8h, v16.8b, v13.8b //// a0 + a5 (column3,row0)
|
|
ext v29.8b, v20.8b , v21.8b , #6
|
|
umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
|
|
umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
|
|
umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
|
|
umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
|
|
mov v25.d[0], v24.d[1]
|
|
|
|
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1/2 grid values
|
|
|
|
sqrshrun v14.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
|
|
ext v31.8b, v21.8b , v22.8b , #2
|
|
sqrshrun v15.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
|
|
ext v28.8b, v20.8b , v21.8b , #2
|
|
|
|
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
|
|
ext v31.8b, v22.8b , v23.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
|
|
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
|
|
ext v30.8b, v21.8b , v22.8b , #4
|
|
|
|
sqrshrun v16.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
|
|
ext v29.8b, v21.8b , v22.8b , #6
|
|
|
|
ext v28.8b, v21.8b , v22.8b , #2
|
|
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
|
|
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
|
|
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
|
|
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
|
|
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
|
|
ext v31.8b, v23.8b , v24.8b , #2
|
|
|
|
ext v14.8b, v14.8b , v15.8b , #2
|
|
ext v15.8b, v15.8b , v16.8b , #2
|
|
ext v16.8b, v16.8b , v16.8b , #2
|
|
|
|
st1 {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid
|
|
st1 {v16.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
|
|
|
|
ext v30.8b, v22.8b , v23.8b , #4
|
|
ext v29.8b, v22.8b , v23.8b , #6
|
|
|
|
saddl v14.4s, v31.4h, v22.4h //// a0 + a5 (set3)
|
|
ext v28.8b, v22.8b , v23.8b , #2
|
|
smlal v14.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
|
|
smlal v14.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
|
|
smlsl v14.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
|
|
smlsl v14.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
|
|
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
|
|
ext v30.8b, v23.8b , v24.8b , #4
|
|
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
|
|
ext v29.8b, v23.8b , v24.8b , #6
|
|
|
|
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
|
|
ext v28.8b, v23.8b , v24.8b , #2
|
|
ext v31.8b, v25.8b , v25.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
|
|
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
|
|
ext v30.8b, v24.8b , v25.8b , #4
|
|
|
|
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
|
|
ext v29.8b, v24.8b , v25.8b , #6
|
|
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
shrn v28.4h, v14.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
|
|
|
|
ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data
|
|
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
|
|
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
|
|
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
|
|
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
|
|
shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4)
|
|
mov v20.d[1], v21.d[0]
|
|
sqrshrun v26.8b, v20.8h, #2 //// half,half grid set1,2
|
|
|
|
|
|
////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4
|
|
////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5
|
|
|
|
////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values
|
|
//// ////////////// ROW 6 ///////////////////////
|
|
|
|
//// Process first vertical interpolated row
|
|
//// each column is
|
|
|
|
cmp x10, #1 //// if 17 rows are complete, skip the last row
|
|
beq filter_2dvh_skip_row
|
|
uaddl v20.8h, v17.8b, v14.8b //// a0 + a5 (column1,row0)
|
|
movi v31.8b, #5
|
|
umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0)
|
|
umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0)
|
|
umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
|
|
umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
|
|
mov v21.d[0], v20.d[1]
|
|
mov v28.d[1], v29.d[0]
|
|
sqrshrun v27.8b, v28.8h, #2 //// half,half grid set3,4
|
|
shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5)
|
|
|
|
uaddl v22.8h, v18.8b, v15.8b //// a0 + a5 (column2,row0)
|
|
umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0)
|
|
umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0)
|
|
umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
|
|
umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
|
|
mov v23.d[0], v22.d[1]
|
|
|
|
sqrshrun v28.8b, v28.8h, #2 //// half,half grid set5
|
|
ext v30.8b, v20.8b , v21.8b , #4
|
|
|
|
uaddl v24.8h, v19.8b, v16.8b //// a0 + a5 (column3,row0)
|
|
ext v29.8b, v20.8b , v21.8b , #6
|
|
umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0)
|
|
umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0)
|
|
umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
|
|
umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
|
|
mov v25.d[0], v24.d[1]
|
|
|
|
st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1/2 grid values
st1 {v28.h}[0], [x2], x19 //// store 1/2,1/2 grid values
|
|
|
|
sqrshrun v17.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
|
|
ext v31.8b, v21.8b , v22.8b , #2
|
|
sqrshrun v18.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
|
|
ext v28.8b, v20.8b , v21.8b , #2
|
|
|
|
saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1)
|
|
ext v31.8b, v22.8b , v23.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set1)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set1)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1)
|
|
smlsl v26.4s, v21.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1)
|
|
ext v30.8b, v21.8b , v22.8b , #4
|
|
|
|
sqrshrun v19.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
|
|
ext v29.8b, v21.8b , v22.8b , #6
|
|
|
|
ext v28.8b, v21.8b , v22.8b , #2
|
|
saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2)
|
|
smlal v20.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set2)
|
|
smlal v20.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set2)
|
|
smlsl v20.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2)
|
|
smlsl v20.4s, v22.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2)
|
|
ext v31.8b, v23.8b , v24.8b , #2
|
|
|
|
ext v17.8b, v17.8b , v18.8b , #2
|
|
ext v18.8b, v18.8b , v19.8b , #2
|
|
ext v19.8b, v19.8b , v19.8b , #2
|
|
|
|
st1 {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid
|
|
st1 {v19.h}[0], [x11], x12 //// store row1 - 1,1/2 grid
|
|
|
|
ext v30.8b, v22.8b , v23.8b , #4
|
|
ext v29.8b, v22.8b , v23.8b , #6
|
|
|
|
saddl v18.4s, v31.4h, v22.4h //// a0 + a5 (set3)
|
|
ext v28.8b, v22.8b , v23.8b , #2
|
|
smlal v18.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set3)
|
|
smlal v18.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set3)
|
|
smlsl v18.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3)
|
|
smlsl v18.4s, v23.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3)
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
|
|
shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2)
|
|
ext v30.8b, v23.8b , v24.8b , #4
|
|
shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1)
|
|
ext v29.8b, v23.8b , v24.8b , #6
|
|
|
|
saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4)
|
|
ext v28.8b, v23.8b , v24.8b , #2
|
|
ext v31.8b, v25.8b , v25.8b , #2
|
|
smlal v26.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set4)
|
|
smlal v26.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set4)
|
|
smlsl v26.4s, v28.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4)
|
|
smlsl v26.4s, v24.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4)
|
|
ext v30.8b, v24.8b , v25.8b , #4
|
|
|
|
saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5)
|
|
ext v29.8b, v24.8b , v25.8b , #6
|
|
|
|
ext v31.8b, v24.8b , v25.8b , #2
|
|
shrn v28.4h, v18.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3)
|
|
|
|
ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data
|
|
smlal v22.4s, v30.4h, v0.h[1] //// a0 + a5 + 20a2 (set5)
|
|
smlal v22.4s, v29.4h, v0.h[1] //// a0 + a5 + 20a2 + 20a3 (set5)
|
|
smlsl v22.4s, v31.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5)
|
|
smlsl v22.4s, v25.4h, v0.h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5)
|
|
    shrn      v29.4h, v26.4s, #8                  //// shift by 8 and later we will shift by 2 more with rounding (set4)
    mov       v20.d[1], v21.d[0]
    sqrshrun  v26.8b, v20.8h, #2                  //// half,half grid set1,2

    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2                  //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8                  //// shift by 8 and later we will shift by 2 more with rounding (set5)

    sqrshrun  v28.8b, v28.8h, #2                  //// half,half grid set5

    st1       {v26.8b, v27.8b}, [x2], #16         //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19               //// store 1/2,1/2 grid values

    subs      x10, x10, #1                        //// decrement loop counter

    bne       filter_2dvh_loop

    //// Process first vertical interpolated row
    //// each column is
    //// ////////////// ROW 13 ///////////////////////

    //// Process first vertical interpolated row
    //// each column is

// LDMFD sp!,{x10,x11,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

filter_2dvh_skip_row:
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2                  //// half,half grid set3,4
    shrn      v28.4h, v22.4s, #8                  //// shift by 8 and later we will shift by 2 more with rounding (set5)

    sqrshrun  v28.8b, v28.8h, #2                  //// half,half grid set5

    st1       {v26.8b, v27.8b}, [x2], #16         //// store 1/2,1/2 grid values
    st1       {v28.h}[0], [x2], x19               //// store 1/2,1/2 grid values
// LDMFD sp!,{x10,x11,x12,pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

///*****************************************