///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * @file
// * ihevc_itrans_recon_16x16.s
// *
// * @brief
// * contains function definitions for single stage inverse transform
// *
// * @author
// * anand s
// *
// * @par list of functions:
// * - ihevc_itrans_recon_16x16()
// *
// * @remarks
// * none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// * this function performs inverse transform and reconstruction for 16x16
// * input block
// *
// * @par description:
// * performs inverse transform and adds the prediction data and clips output
// * to 8 bit
// *
// * @param[in] pi2_src
// * input 16x16 coefficients
// *
// * @param[in] pi2_tmp
// * temporary 16x16 buffer for storing inverse transform
// * 1st stage output
// *
// * @param[in] pu1_pred
// * prediction 16x16 block
// *
// * @param[out] pu1_dst
// * output 16x16 block
// *
// * @param[in] src_strd
// * input stride
// *
// * @param[in] pred_strd
// * prediction stride
// *
// * @param[in] dst_strd
// * output stride
// *
// * @param[in] zero_cols
// * zero columns in pi2_src
// *
// * @param[in] zero_rows
// * zero rows in pi2_src
// *
// * @returns void
// *
// * @remarks
// * none
// *
// *******************************************************************************
// */

//void ihevc_itrans_recon_16x16(word16 *pi2_src,
//                              word16 *pi2_tmp,
//                              uword8 *pu1_pred,
//                              uword8 *pu1_dst,
//                              word32 src_strd,
//                              word32 pred_strd,
//                              word32 dst_strd,
//                              word32 zero_cols,
//                              word32 zero_rows)

//**************variables vs registers*************************
// x0 => *pi2_src
// x1 => *pi2_tmp
// x2 => *pu1_pred
// x3 => *pu1_dst
// src_strd
// pred_strd
// dst_strd
// x12 => zero_cols
// x11 => zero_rows
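
// As orientation, the whole routine computes roughly the following C sketch
// (a hand-written summary, not the library's reference code; idct16_col,
// idct16_row and clip_u8 are hypothetical helpers):
//
//     int16_t out[16];
//     int r, c;
//     /* stage 1: one 16-point inverse transform per column, >> 7 */
//     for(c = 0; c < 16; c++)
//     {
//         idct16_col(pi2_src + c, src_strd, out, shift_stage1_idct);
//         for(r = 0; r < 16; r++)
//             pi2_tmp[r * 16 + c] = out[r];
//     }
//     /* stage 2: one inverse transform per row, >> 12, then add the
//        prediction and clip the result to 8 bit */
//     for(r = 0; r < 16; r++)
//     {
//         idct16_row(pi2_tmp + r * 16, out, shift_stage2_idct);
//         for(c = 0; c < 16; c++)
//             pu1_dst[r * dst_strd + c] =
//                 clip_u8(out[c] + pu1_pred[r * pred_strd + c]);
//     }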

.text
.align 4

.include "ihevc_neon_macros.s"

.set shift_stage1_idct , 7
.set shift_stage2_idct , 12
//#define zero_cols x12
//#define zero_rows x11
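
// The 0xfff0/0xff00 compares below treat x12/x11 as bitmasks of all-zero
// columns/rows. A C sketch of the column decision (an interpretation of the
// code, not library source; first_stage handles 4 columns per pass):
//
//     zero_cols &= 0xffff;
//     if(zero_cols >= 0xfff0)      x14 = 1; /* only columns 0-3 nonzero */
//     else if(zero_cols >= 0xff00) x14 = 2; /* only columns 0-7 nonzero */
//     else                         x14 = 4; /* all 16 columns           */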

.globl ihevc_itrans_recon_16x16_av8

.extern g_ai2_ihevc_trans_16_transpose

.type ihevc_itrans_recon_16x16_av8, %function

ihevc_itrans_recon_16x16_av8:

    ldr w11, [sp]
    // stmfd sp!,{x4-x12,x14}
    push_v_regs
    stp x19, x20,[sp,#-16]!
    stp x5, x6,[sp,#-16]!
    // add sp,sp,#40

    // ldr x8,[sp,#4] @ prediction stride
    // ldr x7,[sp,#8] @ destination stride
    mov x6, x4 // src stride
    mov x12, x7 // zero_cols

    adrp x14, :got:g_ai2_ihevc_trans_16_transpose
    ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_16_transpose]
    ld1 {v0.4h, v1.4h, v2.4h, v3.4h},[x14] //// d0-d3 hold the constant data
    mov x7,#0xffff
    and x12,x12,x7
    and x11,x11,x7
    lsl x6, x6, #1 // src_strd * sizeof(word16)
    add x9,x0,x6, lsl #1 // 2 rows

    add x10,x6,x6, lsl #1 // 3 rows
    add x5,x6,x6,lsl #2 // 5 rows
    mov x7,#0xfff0

    cmp x12,x7
    bge zero_12cols_decision

    mov x19,#0xff00
    cmp x12,x19
    bge zero_8cols_decision

    mov x14,#4
    cmp x11,x7
    sub x20,x6,#0
    neg x20, x20
    csel x10,x20,x10,ge

    mov x19,#0xff00
    cmp x11,x19
    csel x8, x5, x8,ge
    sub x20,x8,#0
    neg x20, x20
    csel x8,x20,x8,ge
    csel x8, x10, x8,lt
    add x5,x5,x6,lsl #3
    sub x20,x5,#0
    neg x5, x20

    b first_stage_top_four_bottom_four

zero_12cols_decision:
    mov x14,#1
    mov x19,#0xff00
    cmp x11,x19
    csel x8, x5, x8,ge
    csel x8, x10, x8,lt
    add x5,x5,x6,lsl #3
    sub x20,x5,#0
    neg x5, x20

    b first_stage_top_four_bottom_four

zero_8cols_decision:
    mov x14,#2
    mov x8,x5
    sub x20,x8,#0
    neg x8, x20
    mov x19,#0xff00
    cmp x11,x19
    csel x8, x10, x8,lt
    add x5,x5,x6,lsl #3
    sub x20,x5,#0
    neg x5, x20
    cmp x11,x7
    sub x20,x6,#0
    neg x20, x20
    csel x10,x20,x10,ge

    b first_stage_top_four_bottom_four


//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
//d0[2]= 89 d2[2]=50
//d0[3]= 87 d2[3]=43
//d1[0]= 83 d3[0]=36
//d1[1]= 80 d3[1]=25
//d1[2]= 75 d3[2]=18
//d1[3]= 70 d3[3]=9
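
// In other words, d0-d3 hold the 16-point transform coefficients loaded
// from g_ai2_ihevc_trans_16_transpose; as a C table (inferred from the
// values listed above, not copied from the table's source):
//
//     static const int16_t coeffs[16] =
//     {
//         64, 90, 89, 87, 83, 80, 75, 70, /* d0, d1 */
//         64, 57, 50, 43, 36, 25, 18,  9  /* d2, d3 */
//     };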

first_stage:
    add x0,x0,#8
    add x9,x9,#8

first_stage_top_four_bottom_four:

    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    cmp x11,x7
    bge skip_load4rows

    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x8
    ld1 {v9.4h},[x9],x8

// registers used: q0,q1,q3,q5,q2,q4

// d10=x0
// d6 =x1
// d11=x2
// d7 =x3

skip_load4rows:
    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v0.h[2]
    smull v14.4s, v10.4h, v0.h[0]
    smlal v14.4s, v11.4h, v1.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v2.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v3.h[2]

    bge skip_last12rows_kernel1

    smlal v24.4s, v8.4h, v1.h[1]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v1.h[3]
    smlsl v30.4s, v8.4h, v0.h[3]

    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v2.h[3]
    smlsl v28.4s, v9.4h, v0.h[3]
    smlal v30.4s, v9.4h, v3.h[3]

    smlal v12.4s, v4.4h, v1.h[0]
    smlal v12.4s, v5.4h, v1.h[2]
    smlal v14.4s, v4.4h, v3.h[0]
    smlsl v14.4s, v5.4h, v3.h[2]
    smlsl v16.4s, v4.4h, v3.h[0]
    smlsl v16.4s, v5.4h, v0.h[2]
    smlsl v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v2.h[2]

//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
//d0[2]= 89 d2[2]=50
//d0[3]= 87 d2[3]=43
//d1[0]= 83 d3[0]=36
//d1[1]= 80 d3[1]=25
//d1[2]= 75 d3[2]=18
//d1[3]= 70 d3[3]=9
    mov x19,#0xff00
    cmp x11,x19
    bge skip_last12rows_kernel1

    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x5
    ld1 {v9.4h},[x9],x5

    smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
    smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smlal v24.4s, v8.4h, v3.h[1]
    smlsl v26.4s, v8.4h, v1.h[3]
    smlal v28.4s, v8.4h, v0.h[1]
    smlsl v30.4s, v8.4h, v1.h[1]

    smlal v24.4s, v9.4h, v3.h[3]
    smlsl v26.4s, v9.4h, v3.h[1]
    smlal v28.4s, v9.4h, v2.h[3]
    smlsl v30.4s, v9.4h, v2.h[1]

    smlal v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v2.h[2]
    smlal v12.4s, v4.4h, v3.h[0]
    smlal v12.4s, v5.4h, v3.h[2]

    smlsl v14.4s, v10.4h, v0.h[0]
    smlsl v14.4s, v11.4h, v0.h[2]
    smlsl v14.4s, v4.4h, v1.h[0]
    smlsl v14.4s, v5.4h, v2.h[2]

    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v3.h[2]
    smlal v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v1.h[2]

    smlal v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v1.h[2]
    smlsl v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v0.h[2]

skip_last12rows_kernel1:
    add v20.4s, v12.4s , v24.4s
    sub v22.4s, v12.4s , v24.4s

    add v12.4s, v14.4s , v26.4s
    sub v24.4s, v14.4s , v26.4s

    add v14.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s

    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s

    sqrshrn v30.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v19.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v31.4h, v14.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v18.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v12.4h, v12.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v15.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v13.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v14.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
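
// sqrshrn is a saturating rounded shift-right-narrow; per 32-bit lane it
// matches the (a +/- b + rnd) >> shift comments above. A scalar C sketch:
//
//     static int16_t round_shift_narrow(int32_t x, int shift)
//     {
//         x = (x + (1 << (shift - 1))) >> shift; /* add rounding bit */
//         if(x >  32767) x =  32767;             /* saturate to s16  */
//         if(x < -32768) x = -32768;
//         return (int16_t)x;
//     }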

    st1 {v30.4h, v31.4h},[x1],#16
    st1 {v18.4h, v19.4h},[x1],#16
    sub x1,x1,#32

    bge skip_stage1_kernel_load

first_stage_middle_eight:

    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x8
    ld1 {v9.4h},[x9],x8

skip_stage1_kernel_load:
    smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v3.h[2]
    smull v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v2.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlsl v16.4s, v11.4h, v1.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v0.h[2]

    cmp x11,x7
    bge skip_last12rows_kernel2

    smlsl v24.4s, v8.4h, v3.h[1]
    smlal v26.4s, v8.4h, v2.h[1]
    smlal v28.4s, v8.4h, v0.h[1]
    smlal v30.4s, v8.4h, v2.h[3]

    smlal v24.4s, v9.4h, v0.h[1]
    smlal v26.4s, v9.4h, v3.h[1]
    smlsl v28.4s, v9.4h, v1.h[1]
    smlsl v30.4s, v9.4h, v2.h[1]

    smlsl v22.4s, v4.4h, v1.h[0]
    smlal v22.4s, v5.4h, v2.h[2]
    smlsl v20.4s, v4.4h, v3.h[0]
    smlal v20.4s, v5.4h, v0.h[2]
    smlal v16.4s, v4.4h, v3.h[0]
    smlal v16.4s, v5.4h, v3.h[2]
    smlal v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v1.h[2]

//d0[0]= 64 d2[0]=64
//d0[1]= 90 d2[1]=57
//d0[2]= 89 d2[2]=50
//d0[3]= 87 d2[3]=43
//d1[0]= 83 d3[0]=36
//d1[1]= 80 d3[1]=25
//d1[2]= 75 d3[2]=18
//d1[3]= 70 d3[3]=9
    mov x19,#0xff00
    cmp x11,x19
    bge skip_last12rows_kernel2

    ld1 {v10.4h},[x0],x6
    ld1 {v11.4h},[x9],x6
    ld1 {v6.4h},[x0],x10
    ld1 {v7.4h},[x9],x10
    ld1 {v4.4h},[x0],x6
    ld1 {v5.4h},[x9],x6
    ld1 {v8.4h},[x0],x5
    ld1 {v9.4h},[x9],x5

    smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smlal v24.4s, v8.4h, v2.h[3]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v2.h[1]
    smlal v30.4s, v8.4h, v0.h[3]

    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v1.h[1]
    smlal v28.4s, v9.4h, v0.h[3]
    smlsl v30.4s, v9.4h, v0.h[1]

    smlal v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v1.h[2]
    smlsl v22.4s, v4.4h, v3.h[0]
    smlal v22.4s, v5.4h, v0.h[2]

    smlsl v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v3.h[2]
    smlal v20.4s, v4.4h, v1.h[0]
    smlsl v20.4s, v5.4h, v1.h[2]

    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v0.h[2]
    smlsl v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v2.h[2]

    smlal v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v2.h[2]
    smlal v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v3.h[2]

skip_last12rows_kernel2:

    add v4.4s, v22.4s , v24.4s
    sub v22.4s, v22.4s , v24.4s

    add v6.4s, v20.4s , v26.4s
    sub v24.4s, v20.4s , v26.4s

    add v10.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s

    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s

    sqrshrn v18.4h, v4.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v31.4h, v22.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v19.4h, v10.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v30.4h, v26.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v20.4h, v6.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v23.4h, v24.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v21.4h, v16.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v22.4h, v28.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)

// registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}

    ld1 {v4.4h, v5.4h},[x1],#16
    ld1 {v8.4h, v9.4h},[x1],#16
    sub x1,x1,#32

//d4 =x0
//d12=x1
//d5 =x2
//d13=x3

//d18=x4
//d20=x5
//d19=x6
//d21=x7

//d22=x8
//d30=x9
//d23=x10
//d31=x11

//d14=x12
//d8 =x13
//d15=x14
//d9 =x15

    umov x15,v26.d[0]
    umov x16,v27.d[0]
    umov x19,v28.d[0]
    umov x20,v29.d[0]

    trn1 v26.4h, v4.4h, v12.4h
    trn2 v27.4h, v4.4h, v12.4h
    trn1 v28.4h, v5.4h, v13.4h
    trn2 v29.4h, v5.4h, v13.4h

    trn1 v4.2s, v26.2s, v28.2s
    trn2 v5.2s, v26.2s, v28.2s
    trn1 v12.2s, v27.2s, v29.2s
    trn2 v13.2s, v27.2s, v29.2s

    trn1 v26.4h, v18.4h, v20.4h
    trn2 v27.4h, v18.4h, v20.4h
    trn1 v28.4h, v19.4h, v21.4h
    trn2 v29.4h, v19.4h, v21.4h

    trn1 v18.2s, v26.2s, v28.2s
    trn2 v19.2s, v26.2s, v28.2s
    trn1 v20.2s, v27.2s, v29.2s
    trn2 v21.2s, v27.2s, v29.2s

    trn1 v26.4h, v22.4h, v30.4h
    trn2 v27.4h, v22.4h, v30.4h
    trn1 v28.4h, v23.4h, v31.4h
    trn2 v29.4h, v23.4h, v31.4h

    trn1 v22.2s, v26.2s, v28.2s
    trn2 v23.2s, v26.2s, v28.2s
    trn1 v30.2s, v27.2s, v29.2s
    trn2 v31.2s, v27.2s, v29.2s

    trn1 v26.4h, v14.4h, v8.4h
    trn2 v27.4h, v14.4h, v8.4h
    trn1 v28.4h, v15.4h, v9.4h
    trn2 v29.4h, v15.4h, v9.4h

    trn1 v14.2s, v26.2s, v28.2s
    trn2 v15.2s, v26.2s, v28.2s
    trn1 v8.2s, v27.2s, v29.2s
    trn2 v9.2s, v27.2s, v29.2s

    mov v26.d[0],x15
    mov v27.d[0],x16
    mov v28.d[0],x19
    mov v29.d[0],x20

// d4 =x0 1- 4 values
// d5 =x2 1- 4 values
// d12=x1 1- 4 values
// d13=x3 1- 4 values

// d18=x0 5- 8 values
// d19=x2 5- 8 values
// d20=x1 5- 8 values
// d21=x3 5- 8 values

// d22=x0 9-12 values
// d23=x2 9-12 values
// d30=x1 9-12 values
// d31=x3 9-12 values

// d14=x0 13-16 values
// d15=x2 13-16 values
// d8 =x1 13-16 values
// d9 =x3 13-16 values
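
// The trn1/trn2 pairs above amount to 4x4 transposes of 16-bit lanes, which
// is what turns the column-order stage-1 results into row order. The scalar
// equivalent of one such block is simply (a sketch):
//
//     static void transpose_4x4(int16_t m[4][4])
//     {
//         int i, j;
//         for(i = 0; i < 4; i++)
//             for(j = i + 1; j < 4; j++)
//             {
//                 int16_t t = m[i][j];
//                 m[i][j] = m[j][i];
//                 m[j][i] = t;
//             }
//     }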

    st1 {v4.4h, v5.4h},[x1],#16
    st1 {v12.4h, v13.4h},[x1],#16

    st1 {v18.4h, v19.4h},[x1],#16
    st1 {v20.4h, v21.4h},[x1],#16
    st1 {v22.4h, v23.4h},[x1],#16
    st1 {v30.4h, v31.4h},[x1],#16
    st1 {v14.4h, v15.4h},[x1],#16
    st1 {v8.4h, v9.4h},[x1],#16

    subs x14,x14,#1
    bne first_stage

    mov x6,x7 // x6 = 0xfff0 mask

    ldp x8, x7,[sp],#16 // x8 = pred_strd, x7 = dst_strd

    mov x10,#16

    cmp x12,x6
    sub x20,x1,#128
    csel x1, x20, x1,ge
    bge label1

    mov x19,#0xff00
    cmp x12,x19
    sub x20,x1,#256
    csel x1, x20, x1,ge
    bge label_2

    sub x1,x1,#512
    sub x20,x10,#0
    neg x10, x20

label_2:
    add x9,x1,#128
    add x11,x9,#128
    add x0,x11,#128

label1:
    // mov x6,x1

    mov x14,#4
    add x4,x2,x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
    add x5,x8,x8, lsl #1 // x5 = pred_strd * 3
    // add x0,x3,x7, lsl #1 @ x0 points to 3rd row of dest data
    // add x10,x7,x7, lsl #1 @

second_stage:
    ld1 {v10.4h, v11.4h},[x1],#16
    ld1 {v6.4h, v7.4h},[x1],x10
    cmp x12,x6
    bge second_stage_process
    ld1 {v4.4h, v5.4h},[x9],#16
    ld1 {v8.4h, v9.4h},[x9],x10

second_stage_process:

    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v2.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v2.h[3] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v0.h[2]
    smull v14.4s, v10.4h, v0.h[0]
    smlal v14.4s, v11.4h, v1.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v2.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v3.h[2]

    bge skip_last8rows_stage2_kernel1

    smlal v24.4s, v8.4h, v1.h[1]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v1.h[3]
    smlsl v30.4s, v8.4h, v0.h[3]

    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v2.h[3]
    smlsl v28.4s, v9.4h, v0.h[3]
    smlal v30.4s, v9.4h, v3.h[3]

    smlal v12.4s, v4.4h, v1.h[0]
    smlal v12.4s, v5.4h, v1.h[2]
    smlal v14.4s, v4.4h, v3.h[0]
    smlsl v14.4s, v5.4h, v3.h[2]
    smlsl v16.4s, v4.4h, v3.h[0]
    smlsl v16.4s, v5.4h, v0.h[2]
    smlsl v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v2.h[2]

    mov x19,#0xff00
    cmp x12,x19
    bge skip_last8rows_stage2_kernel1

    ld1 {v10.4h, v11.4h},[x11],#16
    ld1 {v6.4h, v7.4h},[x11],x10
    ld1 {v4.4h, v5.4h},[x0],#16
    ld1 {v8.4h, v9.4h},[x0],x10

    smlal v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v1.h[1] //// y1 * cos3(part of b1)
    smlsl v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v0.h[1] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v2.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v2.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlal v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smlal v24.4s, v8.4h, v3.h[1]
    smlsl v26.4s, v8.4h, v1.h[3]
    smlal v28.4s, v8.4h, v0.h[1]
    smlsl v30.4s, v8.4h, v1.h[1]

    smlal v24.4s, v9.4h, v3.h[3]
    smlsl v26.4s, v9.4h, v3.h[1]
    smlal v28.4s, v9.4h, v2.h[3]
    smlsl v30.4s, v9.4h, v2.h[1]

    smlal v12.4s, v10.4h, v0.h[0]
    smlal v12.4s, v11.4h, v2.h[2]
    smlal v12.4s, v4.4h, v3.h[0]
    smlal v12.4s, v5.4h, v3.h[2]

    smlsl v14.4s, v10.4h, v0.h[0]
    smlsl v14.4s, v11.4h, v0.h[2]
    smlsl v14.4s, v4.4h, v1.h[0]
    smlsl v14.4s, v5.4h, v2.h[2]

    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v3.h[2]
    smlal v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v1.h[2]

    smlal v18.4s, v10.4h, v0.h[0]
    smlal v18.4s, v11.4h, v1.h[2]
    smlsl v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v0.h[2]

skip_last8rows_stage2_kernel1:

    add v20.4s, v12.4s , v24.4s
    sub v22.4s, v12.4s , v24.4s

    add v12.4s, v14.4s , v26.4s
    sub v24.4s, v14.4s , v26.4s

    add v14.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s

    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s

    sqrshrn v30.4h, v20.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v19.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v31.4h, v14.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v18.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v12.4h, v12.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v15.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v13.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v14.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)

    bge skip_stage2_kernel_load

// q2,q4,q6,q7 are used
    ld1 {v10.4h, v11.4h},[x1],#16
    ld1 {v6.4h, v7.4h},[x1],#16
    ld1 {v4.4h, v5.4h},[x9],#16
    ld1 {v8.4h, v9.4h},[x9],#16
skip_stage2_kernel_load:
    sub x1,x1,#32
    st1 {v30.4h, v31.4h},[x1],#16
    st1 {v18.4h, v19.4h},[x1],#16
    sub x1,x1,#32

    smull v24.4s, v6.4h, v2.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v2.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v3.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v3.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v1.h[1] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v0.h[1] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v1.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v3.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v3.h[2]
    smull v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v2.h[2]
    smull v16.4s, v10.4h, v0.h[0]
    smlsl v16.4s, v11.4h, v1.h[2]
    smull v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v0.h[2]

    cmp x12,x6
    bge skip_last8rows_stage2_kernel2

    smlsl v24.4s, v8.4h, v3.h[1]
    smlal v26.4s, v8.4h, v2.h[1]
    smlal v28.4s, v8.4h, v0.h[1]
    smlal v30.4s, v8.4h, v2.h[3]

    smlal v24.4s, v9.4h, v0.h[1]
    smlal v26.4s, v9.4h, v3.h[1]
    smlsl v28.4s, v9.4h, v1.h[1]
    smlsl v30.4s, v9.4h, v2.h[1]

    smlsl v22.4s, v4.4h, v1.h[0]
    smlal v22.4s, v5.4h, v2.h[2]
    smlsl v20.4s, v4.4h, v3.h[0]
    smlal v20.4s, v5.4h, v0.h[2]
    smlal v16.4s, v4.4h, v3.h[0]
    smlal v16.4s, v5.4h, v3.h[2]
    smlal v18.4s, v4.4h, v1.h[0]
    smlsl v18.4s, v5.4h, v1.h[2]

    mov x19,#0xff00
    cmp x12,x19
    bge skip_last8rows_stage2_kernel2

    ld1 {v10.4h, v11.4h},[x11],#16
    ld1 {v6.4h, v7.4h},[x11],#16
    ld1 {v4.4h, v5.4h},[x0],#16
    ld1 {v8.4h, v9.4h},[x0],#16

    smlsl v24.4s, v6.4h, v3.h[3] //// y1 * cos1(part of b0)
    smlsl v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smlal v28.4s, v6.4h, v2.h[3] //// y1 * sin3(part of b2)
    smlal v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlsl v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlal v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlal v28.4s, v7.4h, v3.h[3] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smlal v24.4s, v8.4h, v2.h[3]
    smlal v26.4s, v8.4h, v3.h[3]
    smlsl v28.4s, v8.4h, v2.h[1]
    smlal v30.4s, v8.4h, v0.h[3]

    smlal v24.4s, v9.4h, v1.h[3]
    smlsl v26.4s, v9.4h, v1.h[1]
    smlal v28.4s, v9.4h, v0.h[3]
    smlsl v30.4s, v9.4h, v0.h[1]

    smlal v22.4s, v10.4h, v0.h[0]
    smlsl v22.4s, v11.4h, v1.h[2]
    smlsl v22.4s, v4.4h, v3.h[0]
    smlal v22.4s, v5.4h, v0.h[2]

    smlsl v20.4s, v10.4h, v0.h[0]
    smlsl v20.4s, v11.4h, v3.h[2]
    smlal v20.4s, v4.4h, v1.h[0]
    smlsl v20.4s, v5.4h, v1.h[2]

    smlsl v16.4s, v10.4h, v0.h[0]
    smlal v16.4s, v11.4h, v0.h[2]
    smlsl v16.4s, v4.4h, v1.h[0]
    smlal v16.4s, v5.4h, v2.h[2]

    smlal v18.4s, v10.4h, v0.h[0]
    smlsl v18.4s, v11.4h, v2.h[2]
    smlal v18.4s, v4.4h, v3.h[0]
    smlsl v18.4s, v5.4h, v3.h[2]

skip_last8rows_stage2_kernel2:

    add v4.4s, v22.4s , v24.4s
    sub v22.4s, v22.4s , v24.4s

    add v6.4s, v20.4s , v26.4s
    sub v24.4s, v20.4s , v26.4s

    add v10.4s, v16.4s , v28.4s
    sub v26.4s, v16.4s , v28.4s

    add v16.4s, v18.4s , v30.4s
    sub v28.4s, v18.4s , v30.4s

    sqrshrn v18.4h, v4.4s,#shift_stage2_idct //// x0 = (a0 + b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v31.4h, v22.4s,#shift_stage2_idct //// x7 = (a0 - b0 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v19.4h, v10.4s,#shift_stage2_idct //// x2 = (a2 + b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v30.4h, v26.4s,#shift_stage2_idct //// x5 = (a2 - b2 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v20.4h, v6.4s,#shift_stage2_idct //// x1 = (a1 + b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v23.4h, v24.4s,#shift_stage2_idct //// x6 = (a1 - b1 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v21.4h, v16.4s,#shift_stage2_idct //// x3 = (a3 + b3 + rnd) >> 12(shift_stage2_idct)
    sqrshrn v22.4h, v28.4s,#shift_stage2_idct //// x4 = (a3 - b3 + rnd) >> 12(shift_stage2_idct)

    ld1 {v4.4h, v5.4h},[x1],#16
    ld1 {v8.4h, v9.4h},[x1],#16

// registers used: {q2,q4,q6,q7}, {q9,q15,q10,q11}

//d4 =x0
//d12=x1
//d5 =x2
//d13=x3

//d18=x4
//d20=x5
//d19=x6
//d21=x7

//d22=x8
//d30=x9
//d23=x10
//d31=x11

//d14=x12
//d8 =x13
//d15=x14
//d9 =x15

    umov x15,v26.d[0]
    umov x16,v27.d[0]
    umov x19,v28.d[0]
    umov x20,v29.d[0]

    trn1 v26.4h, v4.4h, v12.4h
    trn2 v27.4h, v4.4h, v12.4h
    trn1 v28.4h, v5.4h, v13.4h
    trn2 v29.4h, v5.4h, v13.4h

    trn1 v4.2s, v26.2s, v28.2s
    trn2 v5.2s, v26.2s, v28.2s
    trn1 v12.2s, v27.2s, v29.2s
    trn2 v13.2s, v27.2s, v29.2s

    trn1 v26.4h, v18.4h, v20.4h
    trn2 v27.4h, v18.4h, v20.4h
    trn1 v28.4h, v19.4h, v21.4h
    trn2 v29.4h, v19.4h, v21.4h

    trn1 v18.2s, v26.2s, v28.2s
    trn2 v19.2s, v26.2s, v28.2s
    trn1 v20.2s, v27.2s, v29.2s
    trn2 v21.2s, v27.2s, v29.2s

    trn1 v26.4h, v22.4h, v30.4h
    trn2 v27.4h, v22.4h, v30.4h
    trn1 v28.4h, v23.4h, v31.4h
    trn2 v29.4h, v23.4h, v31.4h

    trn1 v22.2s, v26.2s, v28.2s
    trn2 v23.2s, v26.2s, v28.2s
    trn1 v30.2s, v27.2s, v29.2s
    trn2 v31.2s, v27.2s, v29.2s

    trn1 v26.4h, v14.4h, v8.4h
    trn2 v27.4h, v14.4h, v8.4h
    trn1 v28.4h, v15.4h, v9.4h
    trn2 v29.4h, v15.4h, v9.4h

    trn1 v14.2s, v26.2s, v28.2s
    trn2 v15.2s, v26.2s, v28.2s
    trn1 v8.2s, v27.2s, v29.2s
    trn2 v9.2s, v27.2s, v29.2s

    mov v26.d[0],x15
    mov v27.d[0],x16
    mov v28.d[0],x19
    mov v29.d[0],x20

// d4 =x0 1- 4 values
// d5 =x2 1- 4 values
// d12=x1 1- 4 values
// d13=x3 1- 4 values

// d18=x0 5- 8 values
// d19=x2 5- 8 values
// d20=x1 5- 8 values
// d21=x3 5- 8 values

// d22=x0 9-12 values
// d23=x2 9-12 values
// d30=x1 9-12 values
// d31=x3 9-12 values

// d14=x0 13-16 values
// d15=x2 13-16 values
// d8 =x1 13-16 values
// d9 =x3 13-16 values

// swapping v5 and v18
    mov v5.d[1],v5.d[0]
    mov v5.d[0],v18.d[0]
    mov v18.d[0],v5.d[1]
// swapping v23 and v14
    mov v23.d[1],v23.d[0]
    mov v23.d[0],v14.d[0]
    mov v14.d[0],v23.d[1]
// swapping v13 and v20
    mov v13.d[1],v13.d[0]
    mov v13.d[0],v20.d[0]
    mov v20.d[0],v13.d[1]
// swapping v31 and v8
    mov v31.d[1],v31.d[0]
    mov v31.d[0],v8.d[0]
    mov v8.d[0],v31.d[1]

// q2 : x0 1-8 values
// q11: x0 9-16 values
// q9 : x2 1-8 values
// q7 : x2 9-16 values
// q6 : x1 1-8 values
// q10: x3 1-8 values
// q15: x1 9-16 values
// q4 : x3 9-16 values

// registers free: q8,q14,q12,q13

    ld1 {v16.8b, v17.8b},[x2],x8
    ld1 {v28.8b, v29.8b},[x2],x5
    ld1 {v24.8b, v25.8b},[x4],x8
    ld1 {v26.8b, v27.8b},[x4],x5

    mov v4.d[1] ,v5.d[0]
    mov v22.d[1] ,v23.d[0]
    mov v12.d[1] ,v13.d[0]
    mov v30.d[1] ,v31.d[0]
    mov v18.d[1] ,v19.d[0]
    mov v14.d[1] ,v15.d[0]
    mov v20.d[1] ,v21.d[0]
    mov v8.d[1] ,v9.d[0]

    uaddw v4.8h, v4.8h , v16.8b
    uaddw v22.8h, v22.8h , v17.8b
    uaddw v12.8h, v12.8h , v28.8b
    uaddw v30.8h, v30.8h , v29.8b
    uaddw v18.8h, v18.8h , v24.8b
    uaddw v14.8h, v14.8h , v25.8b
    uaddw v20.8h, v20.8h , v26.8b
    uaddw v8.8h, v8.8h , v27.8b

    sqxtun v16.8b, v4.8h
    sqxtun v17.8b, v22.8h
    sqxtun v28.8b, v12.8h
    sqxtun v29.8b, v30.8h
    sqxtun v24.8b, v18.8h
    sqxtun v25.8b, v14.8h
    sqxtun v26.8b, v20.8h
    sqxtun v27.8b, v8.8h
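
// Each uaddw/sqxtun pair above adds a row of 8-bit prediction to the 16-bit
// residue and saturates back to unsigned 8 bit; per pixel this is (a sketch):
//
//     static uint8_t recon_pixel(int16_t resi, uint8_t pred)
//     {
//         int32_t v = resi + pred;   /* uaddw: widen pred and add   */
//         if(v < 0)   v = 0;         /* sqxtun: saturate to [0,255] */
//         if(v > 255) v = 255;
//         return (uint8_t)v;
//     }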

    st1 {v16.8b, v17.8b},[x3],x7
    st1 {v28.8b, v29.8b},[x3],x7
    st1 {v24.8b, v25.8b},[x3],x7
    st1 {v26.8b, v27.8b},[x3],x7

    subs x14,x14,#1

    bne second_stage

    // sub sp,sp,#40
    // ldmfd sp!,{x4-x12,pc}
    ldp x19, x20,[sp],#16
    pop_v_regs
    ret