@/******************************************************************************
@ *
@ * Copyright (C) 2018 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
.text
.align 4
@/**
@/*******************************************************************************
@/*
@/* @brief
@/* Residue calculation and Forward Transform for 4x4 block with 8-bit input
@/*
@/* @par Description:
@/* Performs residue calculation by subtracting source and prediction and
@/* followed by forward transform
@/*
@/* @param[in] pu1_src
@/* Input 4x4 pixels
@/*
@/* @param[in] pu1_pred
@/* Prediction data
@/*
@/* @param[in] pi4_tmp
@/* Temporary buffer of size 4x4
@/*
@/* @param[out] pi2_dst
@/* Output 4x4 coefficients
@/*
@/* @param[in] src_strd
@/* Input stride
@/*
@/* @param[in] pred_strd
@/* Prediction Stride
@/*
@/* @param[in] dst_strd
@/* Output Stride
@/*
@/* @param[in] chr_plane
@/* Chroma plane
@/*
@/* @returns Block SAD (stored in r0)
@/*
@/* @remarks
@/* None
@/*
@/*******************************************************************************
@/*/
@/**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_pred
@ r2 => *pi4_temp
@ r3 => *pi2_dst
@ r4 => src_strd
@ r5 => pred_strd
@ r6 => dst_strd
@ r7 => chroma_plane
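@
@ For reference, a rough C model of what this kernel computes (a sketch
@ only, not part of the build; variable names are illustrative):
@
@   for (i = 0; i < 4; i++)
@       for (j = 0; j < 4; j++)
@           resi[i][j] = pu1_src[i*src_strd + j] - pu1_pred[i*pred_strd + j];
@   /* per 4-point stage, with e[k] = r[k] + r[3-k], o[k] = r[k] - r[3-k]: */
@   /*   out[0] = 64*(e[0] + e[1]);  out[2] = 64*(e[0] - e[1]);            */
@   /*   out[1] = 83*o[0] + 36*o[1]; out[3] = 36*o[0] - 83*o[1];           */
@   /* the code below keeps 32-bit precision through both stages and       */
@   /* applies the combined rounding (x + 256) >> 9 only at the end        */
@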
.global ihevc_resi_trans_4x4_a9q
ihevc_resi_trans_4x4_a9q:
STMFD sp!, {r4-r7, r14} @ store all the register components from caller function to memory
LDR r4, [sp,#20] @ r4 contains src_strd
LDR r5, [sp,#24] @ r5 contains pred_strd
LDR r6, [sp,#28] @ r6 contains dst_strd
LDR r7, [sp,#32] @ r7 chroma plane
CMP r7, #-1
BEQ NON_INTERLEAVE_LOAD @if flag == NULL_PLANE, use non-interleaving loads
VLD1.64 d0, [r0], r4 @ load row 0 src
VLD1.64 d4, [r0], r4 @ load row 1 src
VLD1.64 d1, [r0], r4 @ load row 2 src
VLD1.64 d5, [r0], r4 @ load row 3 src
VUZP.8 d0, d4 @ de-interleaving unzip instruction to get luma data of pu1_src in d0
VUZP.8 d1, d5 @ de-interleaving unzip instruction to get luma data of pu1_src in d1
VLD1.64 d2, [r1], r5 @ load row 0 pred
VLD1.64 d6, [r1], r5 @ load row 1 pred
VLD1.64 d3, [r1], r5 @ load row 2 pred
VLD1.64 d7, [r1], r5 @ load row 3 pred
VUZP.8 d2, d6 @ de-interleaving unzip instruction to get luma data of pu1_pred in d2
VUZP.8 d3, d7 @ de-interleaving unzip instruction to get luma data of pu1_pred in d3
CMP r7, #0
BEQ LOAD_END
VSWP.8 d0, d4
VSWP.8 d1, d5
VSWP.8 d2, d6
VSWP.8 d3, d7
B LOAD_END
NON_INTERLEAVE_LOAD:
VLD1.U32 d0[0], [r0], r4 @ load row 0 src
VLD1.U32 d0[1], [r0], r4 @ load row 1 src
VLD1.U32 d1[0], [r0], r4 @ load row 2 src
VLD1.U32 d1[1], [r0], r4 @ load row 3 src
VLD1.U32 d2[0], [r1], r5 @ load row 0 pred
VLD1.U32 d2[1], [r1], r5 @ load row 1 pred
VLD1.U32 d3[0], [r1], r5 @ load row 2 pred
VLD1.U32 d3[1], [r1], r5 @ load row 3 pred
LOAD_END:
@ Finding the residue
VSUBL.U8 q2, d0, d2 @ q2 contains 1st 16-bit 8 residues
VSUBL.U8 q3, d1, d3 @ q3 contains 2nd 16-bit 8 residues
@ SAD calculation
VABDL.U8 q12, d0, d2 @ q12 contains absolute differences
VABAL.U8 q12, d1, d3 @ q12 accumulates absolute differences
VADD.U16 d26, d24, d25 @ add d-registers of q12
VPADDL.U16 d27, d26 @ d27 contains 2 32-bit values that have to be added
VPADDL.U32 d28, d27 @ d28 contains 64-bit SAD, only LSB important
VMOV.32 r0, d28[0] @ SAD stored in r0 for return
@ SAD calculation ends
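@ The SAD reduction above is equivalent to this scalar model (sketch only):
@   sad = 0;
@   for (i = 0; i < 4; i++)
@       for (j = 0; j < 4; j++)
@           sad += abs(pu1_src[i*src_strd + j] - pu1_pred[i*pred_strd + j]);
@ VABDL/VABAL build eight 16-bit partial sums, and VADD/VPADDL fold them
@ into the single 32-bit value read back from d28[0].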
@ Forward transform - step 1
VMOV.I16 d2, #64 @ generate immediate constant in d2 for even row multiplication
VTRN.16 d4, d5 @ 3-step transpose of residue matrix starts
VTRN.16 d6, d7 @ 2nd step of the 3-step matrix transpose
VMOV.I16 d0, #83 @ generate immediate constant in d0 for odd row multiplication
VTRN.32 q2, q3 @ Final step of matrix transpose
VMOV.I16 d1, #36 @ generate immediate constant in d1 for odd row multiplication
VSWP d6, d7 @ vector swap to allow even and odd row calculation using Q registers
VADD.S16 q10, q2, q3 @ q10 has the even array
VSUB.S16 q11, q2, q3 @ q11 has the odd array
VMULL.S16 q12, d20, d2 @ e[0]*64
VMLAL.S16 q12, d21, d2[0] @ row 1 of results: e[0]*64 + e[1]*64
VMULL.S16 q13, d20, d2 @ e[0]*64
VMLSL.S16 q13, d21, d2[0] @ row 3 of results: e[0]*64 - e[1]*64
VMULL.S16 q8, d22, d0 @ o[0]*83
VMLAL.S16 q8, d23, d1[0] @ row 2 of results: o[0]*83 + o[1]*36
VMULL.S16 q9, d22, d1 @ o[0]*36
VMLSL.S16 q9, d23, d0[0] @ row 4 of results: o[0]*36 - o[1]*83
@ Forward transform - step 2
VMOV.I32 d2, #64 @ generate immediate constant in d2 for even row multiplication
VMOV.I32 d0, #83 @ generate immediate constant in d0 for odd row multiplication
VTRN.32 q12, q8 @ 4-step transpose of residue matrix starts
VTRN.32 q13, q9 @ 2nd step of the 4-step matrix transpose
VMOV.I32 d1, #36 @ generate immediate constant in d1 for odd row multiplication
VSWP d25, d26 @ 3rd step of the 4-step matrix transpose
VSWP d17, d18 @ 4th step of the 4-step matrix transpose
VADD.S32 q2, q12, q9 @ e[0]
VADD.S32 q3, q8, q13 @ e[1]
VSUB.S32 q10, q12, q9 @ o[0]
VSUB.S32 q11, q8, q13 @ o[1]
VMUL.S32 q12, q2, d2[0] @ e[0]*64
VMLA.S32 q12, q3, d2[0] @ row 1 of results: e[0]*64 + e[1]*64
VMUL.S32 q13, q2, d2[0] @ e[0]*64
VMLS.S32 q13, q3, d2[0] @ row 3 of results: e[0]*64 - e[1]*64
VMUL.S32 q8, q10, d0[0] @ o[0]*83
VMLA.S32 q8, q11, d1[0] @ row 2 of results: o[0]*83 + o[1]*36
VMUL.S32 q9, q10, d1[0] @ o[0]*36
VMLS.S32 q9, q11, d0[0] @ row 4 of results: o[0]*36 - o[1]*83
VRSHRN.S32 d0, q12, #9 @ (row1 + 256)/512
VRSHRN.S32 d1, q8, #9 @ (row2 + 256)/512
VRSHRN.S32 d2, q13, #9 @ (row3 + 256)/512
VRSHRN.S32 d3, q9, #9 @ (row4 + 256)/512
LSL r7, r6, #1 @ r7 = 2*dst_strd, as pi2_dst contains 2-byte integers
VST1.U16 d0, [r3], r7 @ store 1st row of result
VST1.U16 d1, [r3], r7 @ store 2nd row of result
VST1.U16 d2, [r3], r7 @ store 3rd row of result
VST1.U16 d3, [r3], r7 @ store 4th row of result
LDMFD sp!,{r4-r7,r15} @ restore registers; loading r15 (PC) returns to the caller
@ Function End
@/**
@*******************************************************************************
@*
@* @brief
@* This function performs residue calculation and forward transform type 1
@* on input pixels
@*
@* @description
@* Performs residue calculation by subtracting source and prediction and
@* followed by forward transform
@*
@* @param[in] pu1_src
@* Input 4x4 pixels
@*
@* @param[in] pu1_pred
@* Prediction data
@*
@* @param[in] pi2_tmp
@* Temporary buffer of size 4x4
@*
@* @param[out] pi2_dst
@* Output 4x4 coefficients
@*
@* @param[in] src_strd
@* Input stride
@*
@* @param[in] pred_strd
@* Prediction Stride
@*
@* @param[in] dst_strd
@* Output Stride
@*
@* @param[in] chr_plane (unused)
@* Chroma plane
@*
@* @returns Block SAD (stored in r0)
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_4x4_ttype1(UWORD8 *pu1_src,
@ UWORD8 *pu1_pred,
@ WORD32 *pi4_temp,
@ WORD16 *pi2_dst,
@ WORD32 src_strd,
@ WORD32 pred_strd,
@ WORD32 dst_strd,
@ WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp] - src_strd
@ [sp+4] - pred_strd
@ [sp+8] - dst_strd
@ [sp+12] - chroma_plane
@
@*******************************************************************************
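@ ttype1 is the 4x4 DST used for HEVC intra luma blocks. A rough C model of
@ one 4-point stage, using the 29/55/74/84 factors seen below (sketch only):
@   out[0] = 29*c0 + 55*c1 + 74*c2 + 84*c3;
@   out[1] = 74*(c0 + c1 - c3);
@   out[2] = 84*c0 - 29*c1 - 74*c2 + 55*c3;
@   out[3] = 55*c0 - 84*c1 + 74*c2 - 29*c3;
@ Stage 1 rounds with (x + 1) >> 1 (VRSHR #1) and stage 2 with
@ (x + 128) >> 8 (VRSHRN #8), matching the instructions in the body.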
.global ihevc_resi_trans_4x4_ttype1_a9q
ihevc_resi_trans_4x4_ttype1_a9q:
PUSH {r4}
vpush {d8 - d15}
LDR r2,[sp,#68] @ r2 = src_strd
LDR r4,[sp,#72] @ r4 = pred_strd
VLD1.32 d2[0],[r0],r2 @ Row 1 of source in d2[0]
VLD1.32 d3[0],[r1],r4 @ Row 1 of prediction in d3[0]
VLD1.32 d2[1],[r0],r2 @ Row 2 of source in d2[1]
VLD1.32 d3[1],[r1],r4 @ Row 2 of prediction in d3[1]
VLD1.32 d8[0],[r0],r2 @ Row 3 of source in d8[0]
VABDL.U8 q0,d2,d3 @ Absolute differences of rows 1 and 2 in d0
VLD1.32 d9[0],[r1],r4 @ Row 3 of prediction in d9[0]
VSUBL.U8 q5,d2,d3 @ R1:[d10[3] d10[2] d10[1] d10[0]] => Row 1 of residue
@ R2:[d11[3] d11[2] d11[1] d11[0]] => Row 2 of residue
VLD1.32 d8[1],[r0] @ Row 4 of source in d8[1]
VTRN.16 d10,d11 @ Transpose step 1
VLD1.32 d9[1],[r1] @ Row 4 of prediction in d9[1]
VSUBL.U8 q6,d8,d9 @ R3:[d12[3] d12[2] d12[1] d12[0]] => Row 3 of residue
@ R4:[d13[3] d13[2] d13[1] d13[0]] => Row 4 of residue
VABAL.U8 q0,d8,d9 @ Absolute differences of rows 3 and 4 in d1
VTRN.16 d12,d13 @ Transpose step 2
VTRN.32 q5,q6 @ Transpose step 3, Residue block transposed
@ Columns are in C1:d10, C2:d11, C3:d12 and C4:d13
VADD.S16 d23,d11,d13 @ d23 = C2 + C4
VMOV.I32 d6,#55 @ Constant used for multiplication
VADD.S16 d22,d10,d13 @ d22 = C1 + C4
VADD.U16 d0,d1,d0 @ Accumulating SAD step 1
VMOV.I32 d7,#84 @ Constant used for multiplication
VMULL.S16 q7,d23,d6[0] @ q7 = 55*C2 + 55*C4
VMOV.I32 d4,#74 @ Constant used for multiplication
VMULL.S16 q9,d22,d7[0] @ q9 = 84*C1 + 84*C4
VADD.S16 d16,d10,d11 @ d16 = C1 + C2
VMUL.S16 d12,d12,d4[0] @ d12 = 74*C3
VMOV.I32 d5,#29 @ Constant used for multiplication
VPADDL.U16 d0,d0 @ Accumulating SAD step 2
VSUB.S16 d16,d16,d13 @ d16 = C1 + C2 - C4
VMLAL.S16 q7,d22,d5[0] @ q7 = 29*C1 + 55*C2 + 84*C4
VMLSL.S16 q9,d23,d5[0] @ q9 = 84*C1 - 29*C2 + 55*C4
VMULL.S16 q8,d16,d4[0] @ q8 = 74*C1 + 74*C2 - 74*C4
VPADDL.U32 d0,d0 @ Accumulating SAD step 3, SAD in d0
VSUB.S32 q10,q9,q7 @ q10 = q9 - q7 = 55*C1 - 84*C2 - 29*C4
VMOV.32 r0,d0[0] @ Return SAD value
VRSHR.S32 q8,q8,#1 @ Rounding shift right by 1 in q8 (stage-1 round)
VADDW.S16 q7,q7,d12 @ q7 = 29*C1 + 55*C2 + 74*C3 + 84*C4
VSUBW.S16 q9,q9,d12 @ q9 = 84*C1 - 29*C2 - 74*C3 + 55*C4
VADDW.S16 q10,q10,d12 @ q10 = 55*C1 - 84*C2 + 74*C3 - 29*C4
VRSHR.S32 q7,q7,#1 @ Rounding shift right by 1 in q7
VRSHR.S32 q9,q9,#1 @ Rounding shift right by 1 in q9
VRSHR.S32 q10,q10,#1 @ Rounding shift right by 1 in q10
@ Transform stage 1 is in P1:q7, P2:q8, P3:q9 and P4:q10
VTRN.32 q7,q8
VTRN.32 q9,q10
VSWP d15,d18
VSWP d17,d20 @ Residue block transposed
@ Corresponding columns are in S1:q7, S2:q8, S3:q9 and S4:q10
VADD.S32 q13,q7,q8 @ q13 = S1 + S2
VADD.S32 q1,q7,q10 @ q1 = S1 + S4
VADD.S32 q4,q8,q10 @ q4 = S2 + S4
VSUB.S32 q13,q13,q10 @ q13 = S1 + S2 - S4
VMUL.S32 q12,q1,d5[0] @ q12 = 29*S1 + 29*S4
VMUL.S32 q14,q1,d7[0] @ q14 = 84*S1 + 84*S4
VMUL.S32 q13,q13,d4[0] @ q13 = 74*S1 + 74*S2 - 74*S4
VMLA.S32 q12,q4,d6[0] @ q12 = 29*S1 + 55*S2 + 84*S4
VMLS.S32 q14,q4,d5[0] @ q14 = 84*S1 - 29*S2 + 55*S4
VMUL.S32 q9,q9,d4[0] @ q9 = 74*S3
LDR r4,[sp,#76] @ r4 = dst_strd
LSL r4,r4,#1 @ r4 = 2*dst_strd
VRSHRN.S32 d26,q13,#8
VSUB.S32 q15,q14,q12 @ q15 = q14 - q12 = 55*S1 - 84*S2 - 29*S4
VADD.S32 q12,q12,q9 @ q12 = 29*S1 + 55*S2 + 74*S3 + 84*S4
VSUB.S32 q14,q14,q9 @ q14 = 84*S1 - 29*S2 - 74*S3 + 55*S4
VADD.S32 q15,q15,q9 @ q15 = 55*S1 - 84*S2 + 74*S3 - 29*S4
VRSHRN.S32 d24,q12,#8
VRSHRN.S32 d28,q14,#8
VRSHRN.S32 d30,q15,#8 @ Rounding shift right by 8 and narrow (applies to d24-d30)
@ Transform stage 2 is in U1:d24, U2:d26, U3:d28 and U4:d30
VST1.64 d24,[r3],r4 @ Storing row 1 of transform stage 2
VST1.64 d26,[r3],r4 @ Storing row 2 of transform stage 2
VST1.64 d28,[r3],r4 @ Storing row 3 of transform stage 2
VST1.64 d30,[r3] @ Storing row 4 of transform stage 2
vpop {d8 - d15}
POP {r4}
MOV pc,lr
@/**
@*******************************************************************************
@*
@* @brief
@* This function performs residue calculation and DCT integer forward transform
@* on 8x8 block
@*
@* @description
@* Performs residue calculation by subtracting source and prediction and
@* followed by DCT integer forward transform
@*
@* @param[in] pu1_src
@* Input 8x8 pixels
@*
@* @param[in] pu1_pred
@* Prediction data
@*
@* @param[in] pi2_tmp
@* Temporary buffer of size 8x8
@*
@* @param[out] pi2_dst
@* Output 8x8 coefficients
@*
@* @param[in] src_strd
@* Input stride
@*
@* @param[in] pred_strd
@* Prediction Stride
@*
@* @param[in] dst_strd
@* Output Stride
@*
@* @param[in] chr_plane
@* Chroma plane
@*
@* @returns Block SAD (stored in r0)
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*/
@ UWORD32 ihevc_resi_trans_8x8(UWORD8 *pu1_src,
@ UWORD8 *pu1_pred,
@ WORD32 *pi4_temp,
@ WORD16 *pi2_dst,
@ WORD32 src_strd,
@ WORD32 pred_strd,
@ WORD32 dst_strd,
@ WORD32 chroma_plane);
@
@**************Variables Vs Registers*******************************************
@
@ r0 - pu1_src
@ r1 - pu1_pred
@ r2 - pi4_temp
@ r3 - pi2_dst
@
@ [sp] - src_strd
@ [sp+4] - pred_strd
@ [sp+8] - dst_strd
@ [sp+12] - chroma_plane
@
@*******************************************************************************
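@ Rough C model of the 8x8 butterfly implemented below (sketch only).
@ With e[k] = c[k] + c[7-k] and o[k] = c[k] - c[7-k] per 8-point stage:
@   F0 = 64*(e0 + e1 + e2 + e3);        F4 = 64*(e0 - e1 - e2 + e3);
@   F2 = 83*(e0 - e3) + 36*(e1 - e2);   F6 = 36*(e0 - e3) - 83*(e1 - e2);
@   F1 = 89*o0 + 75*o1 + 50*o2 + 18*o3; F3 = 75*o0 - 18*o1 - 89*o2 - 50*o3;
@   F5 = 50*o0 - 89*o1 + 18*o2 + 75*o3; F7 = 18*o0 - 50*o1 + 75*o2 - 89*o3;
@ Stage 2 repeats the pattern on the transposed stage-1 output and rounds
@ with (x + 1024) >> 11 (VRSHRN #11; #5 where the *64 was left folded).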
.global ihevc_resi_trans_8x8_a9q
ihevc_resi_trans_8x8_a9q:
PUSH {r4,r5}
vpush {d8 - d15}
@ Loading Prediction and Source blocks of size 8x8
LDR r4,[sp,#84] @ r4 = chroma flag
CMP r4,#-1 @ NULL PLANE
BEQ LUMA_LOAD
CMP r4,#1 @ V PLANE
BEQ CHROMA_V_LOAD
@ handling U PLANE
LDR r5,[sp,#72] @ r5 = src_strd
LDR r4,[sp,#76] @ r4 = pred_strd
VLD2.8 {d0,d2},[r1],r4 @ Row 1 of prediction in d0
VLD2.8 {d1,d3},[r0],r5 @ Row 1 of source in d1
VABDL.U8 q15,d1,d0 @ Row 1 of absolute difference in q15
VLD2.8 {d2,d4},[r1],r4 @ Row 2 of prediction in d2
VSUBL.U8 q0,d1,d0 @ Row 1 of residue in q0
VLD2.8 {d3,d5},[r0],r5 @ Row 2 of source in d3
VABDL.U8 q9,d3,d2 @ Row 2 of absolute difference in q9
VLD2.8 {d4,d6},[r1],r4 @ Row 3 of prediction in d4
VSUBL.U8 q1,d3,d2 @ Row 2 of residue in q1
VLD2.8 {d5,d7},[r0],r5 @ Row 3 of source in d5
VABAL.U8 q15,d5,d4 @ Row 3 of absolute difference accumulated in q15
VLD2.8 {d6,d8},[r1],r4 @ Row 4 of prediction in d6
VSUBL.U8 q2,d5,d4 @ Row 3 of residue in q2
VLD2.8 {d7,d9},[r0],r5 @ Row 4 of source in d7
VABAL.U8 q9,d7,d6 @ Row 4 of absolute difference accumulated in q9
VLD2.8 {d8,d10},[r1],r4 @ Row 5 of prediction in d8
VSUBL.U8 q3,d7,d6 @ Row 4 of residue in q3
VLD2.8 {d9,d11},[r0],r5 @ Row 5 of source in d9
VABDL.U8 q10,d9,d8 @ Row 5 of absolute difference in q10
VLD2.8 {d10,d12},[r1],r4 @ Row 6 of prediction in d10
VSUBL.U8 q4,d9,d8 @ Row 5 of residue in q4
VLD2.8 {d11,d13},[r0],r5 @ Row 6 of source in d11
VABAL.U8 q15,d11,d10 @ Row 6 of absolute difference accumulated in q15
VLD2.8 {d12,d14},[r1],r4 @ Row 7 of prediction in d12
VSUBL.U8 q5,d11,d10 @ Row 6 of residue in q5
VLD2.8 {d13,d15},[r0],r5 @ Row 7 of source in d13
VABAL.U8 q9,d13,d12 @ Row 7 of absolute difference accumulated in q9
VLD2.8 {d14,d16},[r1] @ Row 8 of prediction in d14
VSUBL.U8 q6,d13,d12 @ Row 7 of residue in q6
VLD2.8 {d15,d17},[r0] @ Row 8 of source in d15
B LUMA_LOAD_END
CHROMA_V_LOAD:
LDR r5,[sp,#72] @ r5 = src_strd
LDR r4,[sp,#76] @ r4 = pred_strd
VLD2.8 {d0,d2},[r1],r4 @ Row 1 of prediction in d2
VLD2.8 {d1,d3},[r0],r5 @ Row 1 of source in d3
VABDL.U8 q15,d3,d2 @ Row 1 of absolute difference in q15
VLD2.8 {d4,d6},[r1],r4 @ Row 2 of prediction in d6
VSUBL.U8 q0,d3,d2 @ Row 1 of residue in q0
VLD2.8 {d5,d7},[r0],r5 @ Row 2 of source in d7
VABDL.U8 q9,d7,d6 @ Row 2 of absolute difference in q9
VLD2.8 {d8,d10},[r1],r4 @ Row 3 of prediction in d10
VSUBL.U8 q1,d7,d6 @ Row 2 of residue in q1
VLD2.8 {d9,d11},[r0],r5 @ Row 3 of source in d11
VABAL.U8 q15,d11,d10 @ Row 3 of absolute difference accumulated in q15
VLD2.8 {d6,d8},[r1],r4 @ Row 4 of prediction in d8
VSUBL.U8 q2,d11,d10 @ Row 3 of residue in q2
VLD2.8 {d7,d9},[r0],r5 @ Row 4 of source in d9
VABAL.U8 q9,d9,d8 @ Row 4 of absolute difference accumulated in q9
VLD2.8 {d10,d12},[r1],r4 @ Row 5 of prediction in d12
VSUBL.U8 q3,d9,d8 @ Row 4 of residue in q3
VLD2.8 {d11,d13},[r0],r5 @ Row 5 of source in d13
VABDL.U8 q10,d13,d12 @ Row 5 of absolute difference in q10
VLD2.8 {d14,d16},[r1],r4 @ Row 6 of prediction in d16
VSUBL.U8 q4,d13,d12 @ Row 5 of residue in q4
VLD2.8 {d15,d17},[r0],r5 @ Row 6 of source in d17
VABAL.U8 q15,d17,d16 @ Row 6 of absolute difference accumulated in q15
VLD2.8 {d12,d14},[r1],r4 @ Row 7 of prediction in d14
VSUBL.U8 q5,d17,d16 @ Row 6 of residue in q5
VLD2.8 {d13,d15},[r0],r5 @ Row 7 of source in d15
VABAL.U8 q9,d15,d14 @ Row 7 of absolute difference accumulated in q9
VSUBL.U8 q6,d15,d14 @ Row 7 of residue in q6
VLD2.8 {d14,d16},[r1] @ Row 8 of prediction in d16 (swapped into d14 below)
VLD2.8 {d15,d17},[r0] @ Row 8 of source in d17 (swapped into d15 below)
VSWP.8 d14,d16
VSWP.8 d15,d17
B LUMA_LOAD_END
LUMA_LOAD:
LDR r5,[sp,#72] @ r5 = src_strd
LDR r4,[sp,#76] @ r4 = pred_strd
VLD1.64 d0,[r1],r4 @ Row 1 of prediction in d0
VLD1.64 d1,[r0],r5 @ Row 1 of source in d1
VABDL.U8 q15,d1,d0 @ Row 1 of absolute difference in q15
VLD1.64 d2,[r1],r4 @ Row 2 of prediction in d2
VSUBL.U8 q0,d1,d0 @ Row 1 of residue in q0
VLD1.64 d3,[r0],r5 @ Row 2 of source in d3
VABDL.U8 q9,d3,d2 @ Row 2 of absolute difference in q9
VLD1.64 d4,[r1],r4 @ Row 3 of prediction in d4
VSUBL.U8 q1,d3,d2 @ Row 2 of residue in q1
VLD1.64 d5,[r0],r5 @ Row 3 of source in d5
VABAL.U8 q15,d5,d4 @ Row 3 of absolute difference accumulated in q15
VLD1.64 d6,[r1],r4 @ Row 4 of prediction in d6
VSUBL.U8 q2,d5,d4 @ Row 3 of residue in q2
VLD1.64 d7,[r0],r5 @ Row 4 of source in d7
VABAL.U8 q9,d7,d6 @ Row 4 of absolute difference accumulated in q9
VLD1.64 d8,[r1],r4 @ Row 5 of prediction in d8
VSUBL.U8 q3,d7,d6 @ Row 4 of residue in q3
VLD1.64 d9,[r0],r5 @ Row 5 of source in d9
VABDL.U8 q10,d9,d8 @ Row 5 of absolute difference in q10
VLD1.64 d10,[r1],r4 @ Row 6 of prediction in d10
VSUBL.U8 q4,d9,d8 @ Row 5 of residue in q4
VLD1.64 d11,[r0],r5 @ Row 6 of source in d11
VABAL.U8 q15,d11,d10 @ Row 6 of absolute difference accumulated in q15
VLD1.64 d12,[r1],r4 @ Row 7 of prediction in d12
VSUBL.U8 q5,d11,d10 @ Row 6 of residue in q5
VLD1.64 d13,[r0],r5 @ Row 7 of source in d13
VABAL.U8 q9,d13,d12 @ Row 7 of absolute difference accumulated in q9
VLD1.64 d14,[r1] @ Row 8 of prediction in d14
VSUBL.U8 q6,d13,d12 @ Row 7 of residue in q6
VLD1.64 d15,[r0] @ Row 8 of source in d15
LUMA_LOAD_END:
@ Transform stage 1
@ Transposing residue matrix
VABAL.U8 q10,d15,d14 @ Row 8 of absolute difference accumulated in q10
VTRN.16 q0,q1 @ Transpose residue matrix step (1a)
VSUBL.U8 q7,d15,d14 @ Row 8 of residue in q7
VTRN.16 q2,q3 @ Transpose residue matrix step (1b)
VTRN.16 q4,q5 @ Transpose residue matrix step (1c)
VTRN.16 q6,q7 @ Transpose residue matrix step (1d)
VTRN.32 q0,q2 @ Transpose residue matrix step (2a)
VTRN.32 q1,q3 @ Transpose residue matrix step (2b)
VADD.U16 q8,q15,q9 @ SAD calculation (1)
VTRN.32 q4,q6 @ Transpose residue matrix step (2c)
VTRN.32 q5,q7 @ Transpose residue matrix step (2d)
VADD.U16 q8,q8,q10 @ SAD calculation (2)
VSWP d1,d8 @ Transpose residue matrix step (3a)
VSWP d3,d10 @ Transpose residue matrix step (3b)
VADD.U16 d16,d16,d17 @ SAD calculation (3)
VSWP d7,d14 @ Transpose residue matrix step (3c)
VSWP d5,d12 @ Transpose residue matrix step (3d)
@ Columns of residue C0-C7 (8x8 matrix) in q0-q7
VPADDL.U16 d16,d16 @ SAD calculation (4)
@ Evaluating first step in Butterfly diagram
VADD.S16 q10,q0,q7 @ q10 = C0 + C7
VADD.S16 q11,q1,q6 @ q11 = C1 + C6
VPADDL.U32 d16,d16 @ SAD calculation (5)
VADD.S16 q12,q2,q5 @ q12 = C2 + C5
VADD.S16 q13,q3,q4 @ q13 = C3 + C4
VSUB.S16 q4,q3,q4 @ q4 = C3 - C4
VSUB.S16 q5,q2,q5 @ q5 = C2 - C5
VSUB.S16 q6,q1,q6 @ q6 = C1 - C6
VSUB.S16 q7,q0,q7 @ q7 = C0 - C7
@ Calculating F0, F2, F4 and F6
VADD.S16 q1,q11,q12 @ q1 = C1 + C2 + C5 + C6
VADD.S16 q2,q10,q13 @ q2 = C0 + C3 + C4 + C7
MOV r4,#50
LSL r4,r4,#16
ADD r4,r4,#18
MOV r5,#89
LSL r5,r5,#16
ADD r5,r5,#75
VMOV d0,r4,r5 @ 16-bit aligned, d0[3] = 89, d0[2] = 75, d0[1] = 50, d0[0]=18
MOV r4,#83
LSL r4,r4,#16
ADD r4,r4,#36
VMOV d1,r4,r4 @ 16-bit aligned, d1[3] = 83, d1[2] = 36, d1[1] = 83, d1[0]=36
VSUB.S16 q10,q10,q13 @ q10 = C0 - C3 - C4 + C7
VSUB.S16 q11,q11,q12 @ q11 = C1 - C2 - C5 + C6
VMOV.32 r0,d16[0] @ SAD calculation (6) : Return value = SAD
VSUB.S16 q3,q2,q1 @ q3 = C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7
VADD.S16 q2,q2,q1 @ q2 = C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7
VMULL.S16 q14,d20,d1[1] @ q14 = [0] of 83*(C0 - C3 - C4 + C7)
VMULL.S16 q15,d21,d1[1] @ q15 = [1] of 83*(C0 - C3 - C4 + C7)
VMULL.S16 q9,d20,d1[0] @ q9 = [0] of 36*(C0 - C3 - C4 + C7)
VMULL.S16 q10,d21,d1[0] @ q10 = [1] of 36*(C0 - C3 - C4 + C7)
VMLAL.S16 q14,d22,d1[0] @ q14 = F2[0] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
VSHLL.S16 q13,d6,#6 @ q13 = F4[0] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
VMLAL.S16 q15,d23,d1[0] @ q15 = F2[1] = 83*(C0 - C3 - C4 + C7) + 36*(C1 - C2 - C5 + C6)
VSHLL.S16 q3,d7,#6 @ q3 = F4[1] = 64*(C0 - C1 - C2 + C3 + C4 - C5 - C6 + C7)
VMLSL.S16 q9,d22,d1[1] @ q9 = F6[0] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
VSHLL.S16 q12,d4,#6 @ q12 = F0[0] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
VMLSL.S16 q10,d23,d1[1] @ q10 = F6[1] = 36*(C0 - C3 - C4 + C7) - 83*(C1 - C2 - C5 + C6)
VSHLL.S16 q2,d5,#6 @ q2 = F0[1] = 64*(C0 + C1 + C2 + C3 + C4 + C5 + C6 + C7)
@ Calculating F1, F3, F5 and F7
MOV r4,#48
VST1.64 {d24,d25},[r2]! @ Row 1 of transform stage 1 F0[0] stored
VST1.64 {d4,d5},[r2],r4 @ Row 1 of transform stage 1 F0[1] stored
VST1.64 {d28,d29},[r2]! @ Row 3 of transform stage 1 F2[0] stored
VST1.64 {d30,d31},[r2],r4 @ Row 3 of transform stage 1 F2[1] stored
VST1.64 {d26,d27},[r2]! @ Row 5 of transform stage 1 F4[0] stored
VMULL.S16 q1,d14,d0[3] @ q1 = [0] of 89*(C0 - C7)
VMULL.S16 q8,d15,d0[3] @ q8 = [1] of 89*(C0 - C7)
VST1.64 {d6,d7},[r2],r4 @ Row 5 of transform stage 1 F4[1] stored
VMULL.S16 q11,d14,d0[2] @ q11 = [0] of 75*(C0 - C7)
VMULL.S16 q13,d15,d0[2] @ q13 = [1] of 75*(C0 - C7)
VST1.64 {d18,d19},[r2]! @ Row 7 of transform stage 1 F6[0] stored
VMULL.S16 q3,d14,d0[1] @ q3 = [0] of 50*(C0 - C7)
VMULL.S16 q9,d15,d0[1] @ q9 = [1] of 50*(C0 - C7)
VST1.64 {d20,d21},[r2] @ Row 7 of transform stage 1 F6[1] stored
VMULL.S16 q10,d14,d0[0] @ q10 = [0] of 18*(C0 - C7)
VMULL.S16 q7,d15,d0[0] @ q7 = [1] of 18*(C0 - C7)
VMLAL.S16 q1,d12,d0[2] @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6)
VMLAL.S16 q8,d13,d0[2] @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6)
VMLSL.S16 q11,d12,d0[0] @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6)
VMLSL.S16 q13,d13,d0[0] @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6)
VMLSL.S16 q3,d12,d0[3] @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6)
VMLSL.S16 q9,d13,d0[3] @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6)
VMLSL.S16 q10,d12,d0[1] @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6)
VMLSL.S16 q7,d13,d0[1] @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6)
VMLAL.S16 q1,d10,d0[1] @ q1 = [0] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
VMLAL.S16 q8,d11,d0[1] @ q8 = [1] of 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5)
VMLSL.S16 q11,d10,d0[3] @ q11 = [0] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
VMLSL.S16 q13,d11,d0[3] @ q13 = [1] of 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5)
VMLAL.S16 q3,d10,d0[0] @ q3 = [0] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
VMLAL.S16 q9,d11,d0[0] @ q9 = [1] of 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5)
VMLAL.S16 q10,d10,d0[2] @ q10 = [0] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
VMLAL.S16 q7,d11,d0[2] @ q7 = [1] of 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5)
VMLAL.S16 q1,d8,d0[0] @ q1 = F1[0] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
VMLAL.S16 q8,d9,d0[0] @ q8 = F1[1] = 89*(C0 - C7) + 75*(C1 - C6) + 50*(C2 - C5) + 18*(C3 - C4)
VMLSL.S16 q11,d8,d0[1] @ q11 = F3[0] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
VMLSL.S16 q13,d9,d0[1] @ q13 = F3[1] = 75*(C0 - C7) - 18*(C1 - C6) - 89*(C2 - C5) - 50*(C3 - C4)
SUB r2,r2,#176 @ r2 now points to the second row
VMLAL.S16 q3,d8,d0[2] @ q3 = F5[0] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
VMLAL.S16 q9,d9,d0[2] @ q9 = F5[1] = 50*(C0 - C7) - 89*(C1 - C6) + 18*(C2 - C5) + 75*(C3 - C4)
VST1.64 {d2,d3},[r2]! @ Row 2 of transform stage 1 F1[0] stored
VMLSL.S16 q10,d8,d0[3] @ q10 = F7[0] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
VMLSL.S16 q7,d9,d0[3] @ q7 = F7[1] = 18*(C0 - C7) - 50*(C1 - C6) + 75*(C2 - C5) - 89*(C3 - C4)
VST1.64 {d16,d17},[r2],r4 @ Row 2 of transform stage 1 F1[1] stored
VST1.64 {d22,d23},[r2]! @ Row 4 of transform stage 1 F3[0] stored
VST1.64 {d26,d27},[r2],r4 @ Row 4 of transform stage 1 F3[1] stored
VST1.64 {d6,d7},[r2]! @ Row 6 of transform stage 1 F5[0] stored
VST1.64 {d18,d19},[r2],r4 @ Row 6 of transform stage 1 F5[1] stored
VST1.64 {d20,d21},[r2]! @ Row 8 of transform stage 1 F7[0] stored
VST1.64 {d14,d15},[r2] @ Row 8 of transform stage 1 F7[1] stored
@ Transform stage 2 (for rows 1-4 of transform stage 1)
@ Transposing the 4 rows (F0, F1, F2, F3)
@ F0 = {q2,q12}, F1 = {q8,q1}, F2 = {q15,q14} and F3 = {q13,q11}
VTRN.32 q12,q1 @ Transposing first half of transform stage 1 (1a)
VTRN.32 q14,q11 @ Transposing first half of transform stage 1 (1b)
VSWP d25,d28 @ Transposing first half of transform stage 1 (2a)
VSWP d22,d3 @ Transposing first half of transform stage 1 (2b)
VTRN.32 q2,q8 @ Transposing first half of transform stage 1 (3a)
VTRN.32 q15,q13 @ Transposing first half of transform stage 1 (3b)
VSWP d5,d30 @ Transposing first half of transform stage 1 (4a)
VSWP d26,d17 @ Transposing first half of transform stage 1 (4b)
@ B0:q12, B1:q1, B2:q14, B3:q11, B4:q2, B5:q8, B6:q15 and B7:q13
@ Evaluating first step in Butterfly diagram
VADD.S32 q0,q12,q13 @ q0 = B0 + B7
VADD.S32 q5,q11,q2 @ q5 = B3 + B4
VADD.S32 q3,q1,q15 @ q3 = B1 + B6
VADD.S32 q4,q14,q8 @ q4 = B2 + B5
VSUB.S32 q7,q14,q8 @ q7 = B2 - B5
VSUB.S32 q8,q1,q15 @ q8 = B1 - B6
VSUB.S32 q6,q11,q2 @ q6 = B3 - B4
VSUB.S32 q9,q12,q13 @ q9 = B0 - B7
@ Calculating G0, G2, G4 and G6
MOV r4,#18
MOV r5,#50
VMOV d2,r4,r5 @ 32-bit aligned, d2[1] = 50, d2[0] = 18
VSUB.S32 q2,q0,q5 @ q2 = B0 - B3 - B4 + B7
MOV r4,#75
MOV r5,#89
VMOV d3,r4,r5 @ 32-bit aligned, d3[1] = 89, d3[0] = 75
VADD.S32 q10,q0,q5 @ q10 = B0 + B3 + B4 + B7
MOV r4,#36
MOV r5,#83
VMOV d0,r4,r5 @ 32-bit aligned, d0[1] = 83, d0[0] = 36
VSUB.S32 q11,q3,q4 @ q11 = B1 - B2 - B5 + B6
VADD.S32 q3,q3,q4 @ q3 = B1 + B2 + B5 + B6
VMUL.S32 q12,q2,d0[1] @ q12 = 83*(B0 - B3 - B4 + B7)
VMUL.S32 q2,q2,d0[0] @ q2 = 36*(B0 - B3 - B4 + B7)
VMUL.S32 q5,q9,d3[1] @ q5 = 89*(B0 - B7)
VADD.S32 q14,q10,q3 @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
VMUL.S32 q4,q9,d3[0] @ q4 = 75*(B0 - B7)
VSUB.S32 q15,q10,q3 @ q15 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
@ VSHL.S32 q14,q14,#6 ; q14 = G0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
@ VSHL.S32 q15,q15,#6 ; q15 = G4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
VMLA.S32 q12,q11,d0[0] @ q12 = G2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
VRSHRN.I32 d28,q14,#5 @ Truncating last 11 bits in G0
VMLS.S32 q2,q11,d0[1] @ q2 = G6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
VRSHRN.I32 d30,q15,#5 @ Truncating last 11 bits in G4
LDR r4,[sp,#80] @ r4 = dst_strd
LSL r4,r4,#2 @ r4 = 4*dst_strd = byte step of 2 rows of 16-bit coefficients
VMUL.S32 q3,q9,d2[1] @ q3 = 50*(B0 - B7)
VRSHRN.I32 d24,q12,#11 @ Truncating last 11 bits in G2
VMUL.S32 q9,q9,d2[0] @ q9 = 18*(B0 - B7)
VRSHRN.I32 d4,q2,#11 @ Truncating last 11 bits in G6
VMLA.S32 q5,q8,d3[0] @ q5 = 89*(B0 - B7) + 75*(B1 - B6)
VST1.64 d28,[r3],r4 @ First half-row of row 1 of transform stage 2 (G0) stored
VMLS.S32 q4,q8,d2[0] @ q4 = 75*(B0 - B7) - 18*(B1 - B6)
VMLS.S32 q3,q8,d3[1] @ q3 = 50*(B0 - B7) - 89*(B1 - B6)
VST1.64 d24,[r3],r4 @ First half-row of row 3 of transform stage 2 (G2) stored
VMLS.S32 q9,q8,d2[1] @ q9 = 18*(B0 - B7) - 50*(B1 - B6)
VMLA.S32 q5,q7,d2[1] @ q5 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
VST1.64 d30,[r3],r4 @ First half-row of row 5 of transform stage 2 (G4) stored
VMLS.S32 q4,q7,d3[1] @ q4 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
VMLA.S32 q3,q7,d2[0] @ q3 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
VST1.64 d4,[r3] @ First half-row of row 7 of transform stage 2 (G6) stored
VMLA.S32 q9,q7,d3[0] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
VMLA.S32 q5,q6,d2[0] @ q5 = G1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
VMLS.S32 q4,q6,d2[1] @ q4 = G3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
VMLA.S32 q3,q6,d3[0] @ q3 = G5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
VMLS.S32 q9,q6,d3[1] @ q9 = G7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
SUB r3,r3,r4,LSL #1
SUB r3,r3,r4,ASR #1 @ r3 = r3 - 5*dst_strd*2
@ r3 is moved from row 7 to row 2
VRSHRN.I32 d10,q5,#11 @ Truncating last 11 bits in G1
VRSHRN.I32 d8,q4,#11 @ Truncating last 11 bits in G3
VRSHRN.I32 d6,q3,#11 @ Truncating last 11 bits in G5
VST1.64 d10,[r3],r4 @ First half-row of row 2 of transform stage 2 (G1) stored
VRSHRN.I32 d18,q9,#11 @ Truncating last 11 bits in G7
VST1.64 d8,[r3],r4 @ First half-row of row 4 of transform stage 2 (G3) stored
VST1.64 d6,[r3],r4 @ First half-row of row 6 of transform stage 2 (G5) stored
VST1.64 d18,[r3]! @ First half-row of row 8 of transform stage 2 (G7) stored
@ Transform stage 2 (for rows 5-8 of transform stage 1)
@ Loading the 4 rows (F4, F5, F6, F7)
SUB r2,r2,#112 @ r2 jumps from row 8 to row 5 in temporary memory
VLD1.64 {d20,d21},[r2]! @ q10 = F4[0]
VLD1.64 {d22,d23},[r2]! @ q11 = F4[1]
VLD1.64 {d8,d9},[r2]! @ q4 = F5[0]
@ Transposing the 4 rows
@ F4 = {q11,q10}, F5 = {q5,q4}, F6 = {q3,q2} and F7 = {q13,q12}
VTRN.32 q10,q4 @ Transposing second half of transform stage 1 (1a)
VLD1.64 {d10,d11},[r2]! @ q5 = F5[1]
VLD1.64 {d4,d5},[r2]! @ q2 = F6[0]
VLD1.64 {d6,d7},[r2]! @ q3 = F6[1]
VLD1.64 {d24,d25},[r2]! @ q12 = F7[0]
VTRN.32 q2,q12 @ Transposing second half of transform stage 1 (1b)
VLD1.64 {d26,d27},[r2] @ q13 = F7[1]
VSWP d21,d4 @ Transposing second half of transform stage 1 (2a)
VSWP d24,d9 @ Transposing second half of transform stage 1 (2b)
VTRN.32 q11,q5 @ Transposing second half of transform stage 1 (3a)
VTRN.32 q3,q13 @ Transposing second half of transform stage 1 (3b)
VSWP d26,d11 @ Transposing second half of transform stage 1 (4b)
VSWP d23,d6 @ Transposing second half of transform stage 1 (4a)
@ B0:q10, B1:q4, B2:q2, B3:q12, B4:q11, B5:q5, B6:q3 and B7:q13
@ Evaluating first step in Butterfly diagram
VADD.S32 q0,q10,q13 @ q0 = B0 + B7
VADD.S32 q15,q12,q11 @ q15 = B3 + B4
VADD.S32 q1,q4,q3 @ q1 = B1 + B6
VADD.S32 q14,q2,q5 @ q14 = B2 + B5
VSUB.S32 q9,q10,q13 @ q9 = B0 - B7
VSUB.S32 q6,q12,q11 @ q6 = B3 - B4
VSUB.S32 q7,q2,q5 @ q7 = B2 - B5
VSUB.S32 q8,q4,q3 @ q8 = B1 - B6
@ Calculating H0, H2, H4 and H6
VADD.S32 q3,q1,q14 @ q3 = B1 + B2 + B5 + B6
VSUB.S32 q5,q1,q14 @ q5 = B1 - B2 - B5 + B6
MOV r4,#18
MOV r5,#50
VSUB.S32 q4,q0,q15 @ q4 = B0 - B3 - B4 + B7
VMOV d2,r4,r5 @ 32-bit aligned, d2[1] = 50, d2[0] = 18
MOV r4,#75
MOV r5,#89
VADD.S32 q2,q0,q15 @ q2 = B0 + B3 + B4 + B7
VMOV d3,r4,r5 @ 32-bit aligned, d3[1] = 89, d3[0] = 75
MOV r4,#36
MOV r5,#83
@ Calculating H1, H3, H5 and H7
VMUL.S32 q10,q9,d3[1] @ q10 = 89*(B0 - B7)
VMOV d0,r4,r5 @ 32-bit aligned, d0[1] = 83, d0[0] = 36
VMUL.S32 q13,q9,d3[0] @ q13 = 75*(B0 - B7)
VMUL.S32 q12,q4,d0[1] @ q12 = 83*(B0 - B3 - B4 + B7)
VADD.S32 q14,q2,q3 @ q14 = B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7
VMUL.S32 q4,q4,d0[0] @ q4 = 36*(B0 - B3 - B4 + B7)
VSUB.S32 q2,q2,q3 @ q2 = B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7
VMLA.S32 q12,q5,d0[0] @ q12 = H2 = 83*(B0 - B3 - B4 + B7) + 36*(B1 - B2 - B5 + B6)
@ VSHL.S32 q14,q14,#6 ; q14 = H0 = 64*(B0 + B1 + B2 + B3 + B4 + B5 + B6 + B7)
VMLS.S32 q4,q5,d0[1] @ q4 = H6 = 36*(B0 - B3 - B4 + B7) - 83*(B1 - B2 - B5 + B6)
@ VSHL.S32 q2,q15,#6 ; q2 = H4 = 64*(B0 - B1 - B2 + B3 + B4 - B5 - B6 + B7)
VMUL.S32 q11,q9,d2[1] @ q11 = 50*(B0 - B7)
VRSHRN.I32 d28,q14,#5 @ Truncating last 11 bits in H0
VMUL.S32 q9,q9,d2[0] @ q9 = 18*(B0 - B7)
VRSHRN.I32 d24,q12,#11 @ Truncating last 11 bits in H2
VMLA.S32 q10,q8,d3[0] @ q10 = 89*(B0 - B7) + 75*(B1 - B6)
VRSHRN.I32 d4,q2,#5 @ Truncating last 11 bits in H4
VMLS.S32 q13,q8,d2[0] @ q13 = 75*(B0 - B7) - 18*(B1 - B6)
VRSHRN.I32 d8,q4,#11 @ Truncating last 11 bits in H6
LDR r4,[sp,#80] @ r4 = dst_strd
LSL r4,r4,#2 @ r4 = 4*dst_strd = byte step of 2 rows of 16-bit coefficients
SUB r3,r3,r4,LSL #2
ADD r3,r3,r4,ASR #1 @ r3 = r3 - 7*dst_strd*2
@ r3 is moved from row 8 to row 1
VMLS.S32 q11,q8,d3[1] @ q11 = 50*(B0 - B7) - 89*(B1 - B6)
VST1.64 d28,[r3],r4 @ Second half-row of row 1 of transform stage 2 (H0) stored
VMLS.S32 q9,q8,d2[1] @ q9 = 18*(B0 - B7) - 50*(B1 - B6)
VMLA.S32 q10,q7,d2[1] @ q10 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5)
VST1.64 d24,[r3],r4 @ Second half-row of row 3 of transform stage 2 (H2) stored
VMLS.S32 q13,q7,d3[1] @ q13 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5)
VMLA.S32 q11,q7,d2[0] @ q11 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5)
VST1.64 d4,[r3],r4 @ Second half-row of row 5 of transform stage 2 (H4) stored
VMLA.S32 q9,q7,d3[0] @ q9 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5)
VMLA.S32 q10,q6,d2[0] @ q10 = H1 = 89*(B0 - B7) + 75*(B1 - B6) + 50*(B2 - B5) + 18*(B3 - B4)
VST1.64 d8,[r3] @ Second half-row of row 7 of transform stage 2 (H6) stored
VMLS.S32 q13,q6,d2[1] @ q13 = H3 = 75*(B0 - B7) - 18*(B1 - B6) - 89*(B2 - B5) - 50*(B3 - B4)
VMLA.S32 q11,q6,d3[0] @ q11 = H5 = 50*(B0 - B7) - 89*(B1 - B6) + 18*(B2 - B5) + 75*(B3 - B4)
VMLS.S32 q9,q6,d3[1] @ q9 = H7 = 18*(B0 - B7) - 50*(B1 - B6) + 75*(B2 - B5) - 89*(B3 - B4)
SUB r3,r3,r4,LSL #1
SUB r3,r3,r4,ASR #1 @ r3 = r3 - 5*dst_strd*2
@ r3 is moved from row 7 to row 2
VRSHRN.I32 d20,q10,#11 @ Truncating last 11 bits in H1
VRSHRN.I32 d26,q13,#11 @ Truncating last 11 bits in H3
VRSHRN.I32 d22,q11,#11 @ Truncating last 11 bits in H5
VST1.64 d20,[r3],r4 @ Second half-row of row 2 of transform stage 2 (H1) stored
VRSHRN.I32 d18,q9,#11 @ Truncating last 11 bits in H7
VST1.64 d26,[r3],r4 @ Second half-row of row 4 of transform stage 2 (H3) stored
VST1.64 d22,[r3],r4 @ Second half-row of row 6 of transform stage 2 (H5) stored
VST1.64 d18,[r3] @ Second half-row of row 8 of transform stage 2 (H7) stored
vpop {d8 - d15}
POP {r4,r5}
MOV pc,lr
@/**
@*******************************************************************************
@*
@* @brief
@* This function performs residue calculation and forward transform on
@* input pixels
@*
@* @par Description:
@* Performs residue calculation by subtracting source and prediction and
@* followed by forward transform
@*
@* @param[in] pu1_src
@* Input 16x16 pixels
@*
@* @param[in] pu1_pred
@* Prediction data
@*
@* @param[in] pi2_tmp
@* Temporary buffer of size 16x16
@*
@* @param[out] pi2_dst
@* Output 16x16 coefficients
@*
@* @param[in] src_strd
@* Input stride
@*
@* @param[in] pred_strd
@* Prediction Stride
@*
@* @param[in] dst_strd
@* Output Stride
@*
@* @param[in] chr_plane
@* Chroma plane
@*
@* @returns Block SAD (stored in r0)
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*/
.extern g_ai2_ihevc_trans_16
.extern g_ai4_ihevc_trans_16
g_ai2_ihevc_trans_16_addr_1:
.long g_ai2_ihevc_trans_16 - ulbl1 - 8
g_ai2_ihevc_trans_16_addr_2:
.long g_ai2_ihevc_trans_16 - ulbl2 - 8
g_ai4_ihevc_trans_16_addr:
.long g_ai4_ihevc_trans_16 - ulbl3 - 8
.global ihevc_resi_trans_16x16_a9q
ihevc_resi_trans_16x16_a9q:
.equ TMP_STRIDE , 64 @16*4, stride of the temporary buffer in bytes
.equ SHIFT , 13 @shift = 13; // log2(iWidth) - 1 + g_uiBitIncrement
.equ RADD , 4096 @1 << (shift - 1);
.equ COFF_STD_2B , 32 @Stride for g_ai2_ihevc_trans_16 in bytes
.equ COFF_STD_W , 32 @Stride for g_ai4_ihevc_trans_16 in bytes
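@ Worked example of the final rounding (sketch): with SHIFT = 13 and
@ RADD = 4096, each 32-bit vertical-pass accumulator x is written out as
@   dst = (WORD16)((x + 4096) >> 13);
@ e.g. x = 20480 gives (20480 + 4096) >> 13 = 3.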
@ Function prologue
STMFD SP!,{r4-r12,LR} @ save callee-saved registers and the return address
vpush {d8 - d15}
SUB SP,SP,#32
LDR R4,[SP,#136] @get src_strd
LDR R5,[SP,#140] @get pred_strd
LDR R6,[SP,#144] @get dst_strd
LDR R14,[SP,#148] @get chroma_plane
MOV R8,#0 @Set loop counter
LDR R9,g_ai2_ihevc_trans_16_addr_1 @get 16 bit transform matrix
ulbl1:
ADD R9, R9, PC
@Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1] values of g_ai2_ihevc_trans_16
@and write to stack
MOV R12,#COFF_STD_2B
LSL R12,#2
VLD1.S32 D30[0],[R9],R12
VLD1.S32 D30[1],[R9],R12
VLD1.S32 D31[0],[R9],R12
VLD1.S32 D31[1],[R9],R12
VTRN.S32 D30,D31
VTRN.S16 D30,D31
VST1.S16 {d30,d31},[SP]
LDR R9,g_ai2_ihevc_trans_16_addr_2 @get back 16 bit transform matrix
ulbl2:
ADD R9, R9, PC
MOV R7,#TMP_STRIDE
VMOV.S32 Q14,#0
@R0 pu1_src
@R1 pu1_pred
@R2 pi4_tmp
@R3 pi2_dst
@R4 src_strd
@R5 pred_strd
@R6 dst_strd
@R7 tmp_dst Nx4 block stride
@R8 loop cntr
@R9 g_ai2_ihevc_trans_16
@R10 tmp_dst Nx4 block offset
@R11 tmp register
@R12 ------
@R14 chroma_plane
@q14 add 32 bit (RADD; also SAD accumulator in the horizontal pass)
@q15 shift 32 bit (SHIFT)
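@ The horizontal pass below uses the standard 16-point even/odd split
@ (sketch, with r[] one residue row):
@   e[k]   = r[k]  + r[15-k];  o[k]   = r[k]  - r[15-k];  k = 0..7
@   ee[k]  = e[k]  + e[7-k];   eo[k]  = e[k]  - e[7-k];   k = 0..3
@   eee[k] = ee[k] + ee[3-k];  eeo[k] = ee[k] - ee[3-k];  k = 0..1
@ eee/eeo produce output rows 0,4,8,12; eo produces rows 2,6,10,14; and o
@ produces the odd rows, each as a dot product with g_ai2_ihevc_trans_16.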
CORE_LOOP_16X16_HORIZ:
CMP R14,#-1
BGT INTERLEAVED_LOAD_S1
VLD1.U8 {d0,d1},[R0],R4 @LOAD 1-16 src row 1
VLD1.U8 {d2,d3},[R1],R5 @LOAD 1-16 pred row 1
VLD1.U8 {d4,d5},[R0],R4 @LOAD 1-16 src row 2
VLD1.U8 {d6,d7},[R1],R5 @LOAD 1-16 pred row 2
B LOAD_DONE
INTERLEAVED_LOAD_S1:
CMP R14,#1
BEQ INTERLEAVED_LOAD_S2
VLD2.U8 {Q0,Q1},[R0],R4 @LOAD 1-16 src row 1
VLD2.U8 {Q1,Q2},[R1],R5 @LOAD 1-16 pred row 1
VLD2.U8 {Q2,Q3},[R0],R4 @LOAD 1-16 src row 2
VLD2.U8 {Q3,Q4},[R1],R5 @LOAD 1-16 pred row 2
B LOAD_DONE
INTERLEAVED_LOAD_S2:
VLD2.U8 {Q0,Q1},[R0],R4 @LOAD 1-16 src row 1
VSWP.U8 Q0,Q1
VLD2.U8 {Q1,Q2},[R1],R5 @LOAD 1-16 pred row 1
VSWP.U8 Q1,Q2
VLD2.U8 {Q2,Q3},[R0],R4 @LOAD 1-16 src row 2
VSWP.U8 Q2,Q3
VLD2.U8 {Q3,Q4},[R1],R5 @LOAD 1-16 pred row 2
VSWP.U8 Q3,Q4
LOAD_DONE:
VSUBL.U8 Q4,D0,D2 @Get residue 1-8 row 1
VSUBL.U8 Q5,D1,D3 @Get residue 9-16 row 1
VSUBL.U8 Q6,D4,D6 @Get residue 1-8 row 2
VSUBL.U8 Q7,D5,D7 @Get residue 9-16 row 2
@Get blk sads
VABDL.U8 Q15,D0,D2
VABAL.U8 Q15,D1,D3
VABAL.U8 Q15,D4,D6
VABAL.U8 Q15,D5,D7
VADDW.S16 Q14,Q14,D30
VADDW.S16 Q14,Q14,D31
VREV64.S16 Q5,Q5 @Rev row 1
VREV64.S16 Q7,Q7 @Rev row 2
VSWP D10,D11
VSWP D14,D15
VADD.S16 Q8 ,Q4,Q5 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 1
VSUB.S16 Q9 ,Q4,Q5 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 1
VADD.S16 Q10,Q6,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-8 row 2
VSUB.S16 Q11,Q6,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k ->9-16 row 2
VREV64.S16 D24,D17 @rev e[k] k-> 4-7 row 1
VREV64.S16 D25,D21 @rev e[k] k-> 4-7 row 2
VMOV.S16 D17,D20
@arrangement OF DATA
@Q8 A1 A2 A3 A4 B1 B2 B3 B4
@Q12 A8 A7 A6 A5 B8 B7 B6 B5
VADD.S16 Q13,Q8,Q12 @ee[k] = e[k] + e[7 - k] row 1 & 2
VSUB.S16 Q0,Q8,Q12 @eo[k] = e[k] - e[7 - k] row 1 & 2
@D26 R1ee[0] R1ee[1] R1ee[2] R1ee[3]
@D27 R2ee[0] R2ee[1] R2ee[2] R2ee[3]
VTRN.S32 D26,D27 @1-cycle stall before it?
@D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
@D27 R1ee[2] R1ee[3] R2ee[2] R2ee[3]
VREV32.16 D2,D27 @1-cycle stall before it?
@D26 R1ee[0] R1ee[1] R2ee[0] R2ee[1]
@D2 R1ee[3] R1ee[2] R2ee[3] R2ee[2]
VMOV.S16 D27,D26
VNEG.S16 D3,D2
@Q13 R1ee[0] R1ee[1] R2ee[0] R2ee[1] R1ee[0] R1ee[1] R2ee[0] R2ee[1]
@Q1 R1ee[3] R1ee[2] R2ee[3] R2ee[2] -R1ee[3] -R1ee[2] -R2ee[3] -R2ee[2]
@D8 : [0 0] [4 0] [8 0] [12 0]
@D9 : [0 1] [4 1] [8 1] [12 1]
VLD1.S16 {d8,d9},[SP] @[0 0] [4 0] [8 0] [12 0] [0 1] [4 1] [8 1] [12 1]
VADD.S16 Q1,Q13,Q1 @ 1-cycle stall before it?
@Q15 R1eee[0] R1eee[1] R2eee[0] R2eee[1] R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
@Q1 R1eee[0] R1eee[1] R2eee[0] R2eee[1]
@ R1eeo[0] R1eeo[1] R2eeo[0] R2eeo[1]
VTRN.S16 D2,D3 @2-cycle stall before it?
@Q1 R1eee[0] R1eeo[0] R2eee[0] R2eeo[0]
@ R1eee[1] R1eeo[1] R2eee[1] R2eeo[1]
VDUP.S32 D4,D2[0] @R1eee[0] R1eeo[0] R1eee[0] R1eeo[0] ;1-cycle stall?
VDUP.S32 D5,D2[1] @R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
VDUP.S32 D6,D3[0] @R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
VDUP.S32 D7,D3[1] @R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
@---------------Process EO--------------------
@ Early start to avoid stalls
MOV R12,#COFF_STD_2B @Get stride of coeffs
VMULL.S16 Q5,D4,D8 @ g_ai2_ihevc_trans_16 * R1eee[0] R1eeo[0] R1eee[0] R1eeo[0]
VMLAL.S16 Q5,D6,D9 @ + g_ai2_ihevc_trans_16 * R1eee[1] R1eeo[1] R1eee[1] R1eeo[1]
VMULL.S16 Q6,D5,D8 @ g_ai2_ihevc_trans_16 * R2eee[0] R2eeo[0] R2eee[0] R2eeo[0]
VMLAL.S16 Q6,D7,D9 @ + g_ai2_ihevc_trans_16 * R2eee[1] R2eeo[1] R2eee[1] R2eeo[1]
ADD R11,R9,R12,LSL #1 @Load address of g_ai2_ihevc_trans_16[2]
LSL R12,R12,#2
VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[2][0-4]]
VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[6][0-4]
VMULL.S16 Q1,D26,D0 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R1
VMULL.S16 Q2,D26,D1 @g_ai2_ihevc_trans_16[2][0-4] * eo[0-4] R2
VZIP.S32 Q5,Q6 @3-cycle instruction
VMULL.S16 Q3,D27,D0 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R1
VLD1.S16 D26,[R11],R12 @LOAD g_ai2_ihevc_trans_16[10][0-4]
VMULL.S16 Q4,D27,D1 @g_ai2_ihevc_trans_16[6][0-4] * eo[0-4] R2
@These values must go to columns 0 4 8 12, hence we need stride *4
LSL R10,R7,#2
VLD1.S16 D27,[R11],R12 @LOAD g_ai2_ihevc_trans_16[14][0-4]
VST1.32 D10,[R2],R10
VMULL.S16 Q8,D27,D1 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R2
VST1.32 D11,[R2],R10
VMULL.S16 Q7,D27,D0 @g_ai2_ihevc_trans_16[14][0-4] * eo[0-4] R1
VST1.32 D12,[R2],R10
VMULL.S16 Q5,D26,D0 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R1
VST1.32 D13,[R2],R10
VMULL.S16 Q6,D26,D1 @g_ai2_ihevc_trans_16[10][0-4] * eo[0-4] R2
SUB R2,R2,R10,LSL #2
@transpose the 4x4 matrix row1
VTRN.32 Q1, Q3 @R1 transpose1 -- 2 cycles
@transpose the 4x4 matrix row2
VTRN.32 Q2,Q4 @R2 transpose1 -- 2 cycles
VTRN.32 Q5, Q7 @R1 transpose1 -- 2 cycles
VTRN.32 Q6,Q8 @R2 transpose1 -- 2 cycles
VSWP D10,D3 @R1 transpose2
VSWP D14,D7 @R1 transpose2
VSWP D12,D5 @R2 transpose2
VSWP D16,D9 @R2 transpose2
VADD.S32 Q5,Q5,Q1 @R1 add
VADD.S32 Q3,Q3,Q7 @R1 add
VADD.S32 Q2,Q2,Q4 @R2 add
VADD.S32 Q6,Q6,Q8 @R2 add
VADD.S32 Q5,Q5,Q3 @R1 add
VADD.S32 Q4,Q6,Q2 @R2 add
@-----------------------Processing O ----------------------------
@ Early start to avoid stalls
MOV R12,#COFF_STD_2B @Get coeffs stride
LSL R12,R12,#1
ADD R11,R9,#COFF_STD_2B @Get address of g_ai2_ihevc_trans_16[1]
VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[1][0-7] -- 2 cycles
VZIP.S32 Q5,Q4 @ 3 cycle instruction
VMULL.S16 Q6,D18,D4 @o[0][0-3]* R1
VMLAL.S16 Q6,D19,D5 @o[0][4-7]* R1 ; follows MULL instruction: Multiplier accumulator forwarding
@write to memory
@this should go to 2 6 10 14
LSL R10,R7,#2
ADD R2,R2,R7,LSL #1 @move to third row
VST1.32 D10,[R2],R10
VMULL.S16 Q7,D22,D4 @o[0][0-3]* R2
VST1.32 D11,[R2],R10
VMLAL.S16 Q7,D23,D5 @o[0][4-7]* R2
VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[3][0-7]
VST1.32 D8,[R2],R10
VMULL.S16 Q8,D18,D4 @o[1][0-3]* R1
VST1.32 D9,[R2],R10
VMLAL.S16 Q8,D19,D5 @o[1][4-7]* R1
SUB R2,R2,R10,LSL #2
SUB R2,R2,R7,LSL #1
@--------------------Done processing EO -------------------------
@ -----------------Processing O continues------------------------
VMULL.S16 Q10,D22,D4 @o[1][0-3]* R2
VMLAL.S16 Q10,D23,D5 @o[1][4-7]* R2
VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[5][0-7]
VLD1.S16 {d6,d7},[R11],R12 @g_ai2_ihevc_trans_16[7][0-7]
VMULL.S16 Q12,D18,D4 @o[2][0-3]* R1
VMLAL.S16 Q12,D19,D5 @o[2][4-7]* R1
VMULL.S16 Q0,D18,D6 @o[3][0-3]* R1
VMLAL.S16 Q0,D19,D7 @o[3][4-7]* R1
VMULL.S16 Q13,D22,D4 @o[2][0-3]* R2
VMLAL.S16 Q13,D23,D5 @o[2][4-7]* R2
VMULL.S16 Q1,D22,D6 @o[3][0-3]* R2
VMLAL.S16 Q1,D23,D7 @o[3][4-7]* R2
@transpose the 4x4 matrix R1
VTRN.32 Q6, Q8 @ 2-cycle instruction
VTRN.32 Q12,Q0 @ 2-cycle instruction
@transpose the 4x4 matrix R2
VTRN.32 Q7,Q10 @ 2-cycle instruction
VTRN.32 Q13,Q1 @ 2-cycle instruction
VSWP D24,D13
VSWP D0, D17
VSWP D26,D15
VSWP D2,D21
VADD.S32 Q8 ,Q8 ,Q6
VADD.S32 Q12,Q12,Q0
VADD.S32 Q10,Q10,Q7
VADD.S32 Q13,Q13,Q1
VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[9][0-7]
VADD.S32 Q12 ,Q12 ,Q8
VADD.S32 Q13,Q13,Q10
VMULL.S16 Q3,D18,D4 @o[4][0-3]* R1
VMLAL.S16 Q3,D19,D5 @o[4][4-7]* R1
VZIP.S32 Q12,Q13
VMULL.S16 Q4,D22,D4 @o[0][0-3]* R2
VMLAL.S16 Q4,D23,D5 @o[0][4-7]* R2
@write to memory
@this should go to 1 3 5 7
ADD R2,R2,R7
LSL R7,R7,#1
VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[11][0-7]
VST1.32 D24,[R2],R7
VMULL.S16 Q5,D18,D4 @o[5][0-3]* R1
VST1.32 D25,[R2],R7
VMLAL.S16 Q5,D19,D5 @o[5][4-7]* R1
VST1.32 D26,[R2],R7
VMULL.S16 Q6,D22,D4 @o[0][0-3]* R2
VST1.32 D27,[R2],R7
VMLAL.S16 Q6,D23,D5 @o[0][4-7]* R2
VLD1.S16 {d4,d5},[R11],R12 @g_ai2_ihevc_trans_16[13][0-7]
VLD1.S16 {d2,d3},[R11],R12 @g_ai2_ihevc_trans_16[15][0-7]
VMULL.S16 Q7,D18,D4 @o[6][0-3]* R1
VMLAL.S16 Q7,D19,D5 @o[6][4-7]* R1
VMULL.S16 Q10,D18,D2 @o[7][0-3]* R1
VMLAL.S16 Q10,D19,D3 @o[7][4-7]* R1
VMULL.S16 Q8,D22,D4 @o[0][0-3]* R2
VMLAL.S16 Q8,D23,D5 @o[0][4-7]* R2
VMULL.S16 Q12,D22,D2 @o[0][0-3]* R2
VMLAL.S16 Q12,D23,D3 @o[0][4-7]* R2
@transpose the 4x4 matrix R1
VTRN.32 Q3 ,Q5 @ 2-cycle instruction
VTRN.32 Q7 ,Q10 @ transpose step 2 R1 , 2-cycle instruction
@transpose the 4x4 matrix R2
VTRN.32 Q4 ,Q6 @ 2-cycle instruction
VTRN.32 Q8 ,Q12 @ transpose step 2 R2 , 2-cycle instruction
VSWP D14,D7 @ transpose step 3, R1
VSWP D20,D11 @ transpose step 4, R1
VSWP D16,D9 @ transpose step 3, R2
VSWP D24,D13 @ transpose step 4, R2
VADD.S32 Q5 ,Q5 ,Q3
VADD.S32 Q10,Q10,Q7
VADD.S32 Q6 ,Q6 ,Q4
VADD.S32 Q12,Q12,Q8
VADD.S32 Q10,Q10,Q5
VADD.S32 Q12,Q12,Q6
@ 2-cycle stall
VZIP.S32 Q10,Q12 @ 3-cycle instruction
@ 2-cycle stall
@this should go to 9 11 13 15
VST1.32 D20,[R2],R7
VST1.32 D21,[R2],R7
VST1.32 D24,[R2],R7
VST1.32 D25,[R2],R7
SUB R2,R2,R7,LSL #3
LSR R7,R7,#1
SUB R2,R2,R7
ADD R2,R2,#8 @Move to the next column of pi4_tmp
ADD R8,R8,#2 @increment loop cntr
CMP R8,#16 @check loop cntr
BNE CORE_LOOP_16X16_HORIZ @loop back until all 16 columns are done
@*****************Vertical transform************************************
@Initialization for vert transform
@pi4_tmp will be the new src
@tmp stride will be new src stride
@dst will be new pi4_tmp
@dst stride will be new tmp stride
@trans table will be of 32 bit
LDR R9,g_ai4_ihevc_trans_16_addr @get 32 bit transform matrix
ulbl3:
ADD R9, R9, PC
SUB R0,R2,#64 @set tmp as src [-64 to move back to origin]
MOV R2,R3 @set dst as tmp
MOV R4,#TMP_STRIDE @set tmp stride as src stride
LSL R7,R6,#1 @Set dst stride as tmp stride
SUB R4,#48 @Adjust stride for the 3 preceding post-incremented loads (48 bytes)
@Block SAD
VADD.S32 D28,D28,D29
VPADD.S32 D28,D28,D29
VMOV.S32 R3,D28[0]
@ SAD calculation ends -- final value in R3.
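@ As in the 4x4 and 8x8 kernels, the block SAD accumulated in Q14 during
@ the horizontal pass is folded to a scalar here, parked in R3, and moved
@ to R0 at function exit as the return value.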
@Read [0 0] [4 0] [8 0] [12 0],[0 1] [4 1] [8 1] [12 1]
@values of g_ai4_ihevc_trans_16 and write to stack
MOV R12,#COFF_STD_W
LSL R12,R12,#2
VLD1.S32 D28,[R9],R12
VLD1.S32 D29,[R9],R12
VLD1.S32 D30,[R9],R12
VLD1.S32 D31,[R9],R12
SUB R9,R9,R12,LSL #2
VREV64.32 Q15,Q15
VTRN.S32 Q14,Q15
VST1.S32 {Q14-Q15},[SP]
VMOV.U32 Q14,#RADD @get the round factor to q14
VMOV.U32 Q15,#SHIFT @Get the shift to neon
MOV R8,#0 @INIT LOOP
CORE_LOOP_16X16_VERT:
VLD1.S32 {D0,D1},[R0]! @LOAD 1-4 R1
VLD1.S32 {D2,D3},[R0]! @LOAD 5-8 R1
VLD1.S32 {D4,D5},[R0]! @LOAD 9-12 R1
VLD1.S32 {D6,D7},[R0],R4 @LOAD 13-16 R1
VLD1.S32 {D8,D9},[R0]! @LOAD 1-4 R2
VLD1.S32 {D10,D11},[R0]! @LOAD 5-8 R2
VLD1.S32 {D12,D13},[R0]! @LOAD 9-12 R2
VLD1.S32 {D14,D15},[R0],R4 @LOAD 13-16 R2
VREV64.S32 Q2,Q2 @Rev 9-12 R1
VREV64.S32 Q3,Q3 @Rev 13-16 R1
VREV64.S32 Q6,Q6 @Rev 9-12 R2
VREV64.S32 Q7,Q7 @Rev 13-16 R2
VSWP D6,D7
VSWP D4,D5
VADD.S32 Q8 ,Q0,Q3 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R1
VSWP D12,D13 @ dual issued with prev. instruction
VADD.S32 Q9 ,Q1,Q2 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R1
VSWP D14,D15 @ dual issued with prev. instruction
VSUB.S32 Q10,Q0,Q3 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R1
VSUB.S32 Q11,Q1,Q2 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R1
VADD.S32 Q12,Q4,Q7 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 1-4 R2
VREV64.S32 Q9 ,Q9 @rev e[k] k-> 4-7 R1, dual issued with prev. instruction
VADD.S32 Q13,Q5,Q6 @e[k] = resi_tmp_1 + resi_tmp_2 k -> 5-8 R2
VSUB.S32 Q0 ,Q4,Q7 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 1-4 R2
VSWP D18,D19 @ dual issued with prev. instruction
VSUB.S32 Q1 ,Q5,Q6 @o[k] = resi_tmp_1 - resi_tmp_2 k -> 5-8 R2
VREV64.S32 Q13,Q13 @rev e[k] k-> 4-7 R2, dual issued with prev. instruction
VADD.S32 Q2,Q8,Q9 @ee[k] = e[k] + e[7 - k] row R1
VSUB.S32 Q3,Q8,Q9 @eo[k] = e[k] - e[7 - k] row R1
VSWP D26,D27
VADD.S32 Q4,Q12,Q13 @ee[k] = e[k] + e[7 - k] row R2
VSUB.S32 Q5,Q12,Q13 @eo[k] = e[k] - e[7 - k] row R2
VREV64.S32 D5,D5 @rev ee[k] 4-7 R1, dual issued with prev. instruction
VADD.S32 D12,D4,D5 @eee[0] eee[1] R1
VSUB.S32 D13,D4,D5 @eeo[0] eeo[1] R1
VREV64.S32 D9,D9 @rev ee[k] 4-7 R2, dual issued with prev. instruction
VADD.S32 D14,D8,D9 @eee[0] eee[1] R2
VSUB.S32 D15,D8,D9 @eeo[0] eeo[1] R2
VLD1.S32 {Q12,Q13},[SP] @Load g_ai4_ihevc_trans_16 values -> Q12 : [0 0] [8 0] [4 0] [12 0] Q13 : [0 1] [8 1] [4 1] [12 1]
VREV64.S32 Q8,Q6 @Q6 : eee[0] eee[1] eeo[0] eeo[1] R1 -> ;Q8 : eee[1] eee[0] eeo[1] eeo[0] R1
VREV64.S32 Q9,Q7 @Q7 : eee[0] eee[1] eeo[0] eeo[1] R2 -> ;Q9 : eee[1] eee[0] eeo[1] eeo[0] R2
VMUL.S32 Q4,Q6,Q12 @g_ai4_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R1
VMLA.S32 Q4,Q8,Q13 @g_ai4_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R1
VMUL.S32 Q6,Q7,Q12 @g_ai4_ihevc_trans_16 * eee[0] eee[1] eeo[0] eeo[1] R2
VMLA.S32 Q6,Q9,Q13 @g_ai4_ihevc_trans_16 * eee[1] eee[0] eeo[1] eeo[0] R2
@Q3 :R1E00 R1E01 R1E02 R1E03
@Q5 :R2E00 R2E01 R2E02 R2E03
VSWP D7,D10 @ dual issued with prev. instruction
@Q3 :R1E00 R1E01 R2E00 R2E01
@Q5 :R1E02 R1E03 R2E02 R2E03
VSWP D7,D11
@Q3 :R1E00 R1E01 R2E02 R2E03
@Q5 :R1E02 R1E03 R2E00 R2E01
MOV R12,#COFF_STD_W
ADD R11,R9,R12,LSL #1 @Get to row 2 of g_ai4_ihevc_trans_16
LSL R12,R12,#2
VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai4_ihevc_trans_16[2][0-4] -> 2G0 2G1 2G2 2G3, 2-cycle instr.
VADD.S32 Q4,Q4,Q14 @ROUND R1
VMUL.S32 Q12,Q3,Q7 @2G0 2G1 2G2 2G3 * R1E00 R1E01 R2E02 R2E03, 4-cycle instruction
VSWP D14,D15 @2G0 2G1 2G2 2G3 -> 2G2 2G3 2G0 2G1, dual issued with prev. instruction
VADD.S32 Q6,Q6,Q14 @ROUND R2
VSHRN.S32 D8,Q4,#SHIFT @NARROW R1
VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai4_ihevc_trans_16[6][0-4]
VSHRN.S32 D9,Q6,#SHIFT @NARROW R2, dual issued in 2nd cycle
VMUL.S32 Q2,Q3,Q8 @g_ai4_ihevc_trans_16[6][0-4] * eo[0-4], 4-cycle instruction
VSWP D16,D17 @dual issued with prev. instr.
VZIP.S16 D8,D9 @INTERLEAVE R1 R2 R1 R2 R1 R2 to write
VMLA.S32 Q12,Q5,Q7 @2G2 2G3 2G0 2G1 * R1E02 R1E03 R2E00 R2E01, 4-cycle instruction
@WRITE INTO MEM the values or wait to be shuffled
@These values must go to columns 0 4 8 12
LSL R10,R7,#2
VST1.S32 D8[0],[R2],R10
VST1.S32 D9[0],[R2],R10
VST1.S32 D8[1],[R2],R10
VPADD.S32 D18,D24,D25 @D18[0] -> 2G0*R1E00+2G1*R1E01 2G2*R2E02+2G3*R2E03
@D18[1] -> 2G2*R1E02+2G3*R1E03 2G0*R2E00+*2G1R2E01
VST1.S32 D9[1],[R2],R10
VMLA.S32 Q2,Q5,Q8 @g_ai4_ihevc_trans_16[6][0-4] * eo[0-4]
LSL R10,R10,#2
SUB R2,R2,R10
VLD1.S32 {D14,D15},[R11],R12 @LOAD g_ai4_ihevc_trans_16[10][0-4]
VMUL.S32 Q6,Q3,Q7 @g_ai4_ihevc_trans_16[10][0-4] * eo[0-4]
VSWP D14,D15 @ dual issued with prev. instruction
VPADD.S32 D19,D4,D5
VLD1.S32 {D16,D17},[R11],R12 @LOAD g_ai4_ihevc_trans_16[14][0-4]
VMUL.S32 Q2,Q3,Q8 @g_ai4_ihevc_trans_16[14][0-4] * eo[0-4]
VSWP D16,D17
VMLA.S32 Q6,Q5,Q7 @g_ai4_ihevc_trans_16[10][0-4] * eo[0-4]
VADD.S32 Q9,Q9,Q14 @Round by RADD R1
VMLA.S32 Q2,Q5,Q8 @g_ai4_ihevc_trans_16[14][0-4] * eo[0-4]
VSHRN.S32 D8,Q9,#SHIFT @Shift by SHIFT
VPADD.S32 D24,D12,D13
@---------------Processing O, Row 1 and Row 2--------------------------------------
@ Early start to avoid stalls
MOV R12,#COFF_STD_W
ADD R11,R9,R12 @Get to row 1 of g_ai4_ihevc_trans_16
LSL R12,R12,#1
LSL R10,R7,#2
ADD R2,R2,R7,LSL #1 @move to third row
@this should go to 2 6 10 14
VST1.S32 D8[0],[R2],R10
VST1.S32 D8[1],[R2],R10
VPADD.S32 D25,D4,D5 @ dual issued with prev. instruction in 2nd cycle
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[1][0-7]
VADD.S32 Q12,Q12,Q14 @Round by RADD R2, dual issued with prev. instruction in 2nd cycle
VMUL.S32 Q6,Q2,Q0 @g_ai4_ihevc_trans_16[1][0-3]*o[0][0-3] R2
VMLA.S32 Q6,Q3,Q1 @g_ai4_ihevc_trans_16[1][4-7]*o[0][4-7] R2
VSHRN.S32 D9,Q12,#SHIFT @Shift by SHIFT
VMUL.S32 Q2,Q2,Q10 @g_ai4_ihevc_trans_16[1][0-3]*o[0][0-3] R1
VMLA.S32 Q2,Q3,Q11 @g_ai4_ihevc_trans_16[1][4-7]*o[0][4-7] R1
VADD.S32 D11,D12,D13 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R2, dual issued with prev. instr.
VST1.S32 D9[0],[R2],R10
VST1.S32 D9[1],[R2],R10
VADD.S32 D10,D4,D5 @g_ai2_ihevc_trans_16[1][k]*o[0][k]+g_ai2_ihevc_trans_16[0][7-k]*o[0][7-k] R1, dual issued with prev. instr.
LSL R10,R10,#2 @go back to origin
SUB R2,R2,R10
SUB R2,R2,R7,LSL #1
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[3][0-7]
VMUL.S32 Q7,Q2,Q10 @o[0][0-3]
VMLA.S32 Q7,Q3,Q11 @o[0][4-7]
VMUL.S32 Q8,Q2,Q0 @o[0][0-3]
VMLA.S32 Q8,Q3,Q1 @o[0][4-7]
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[5][0-7]
VADD.S32 D18,D14,D15
VMUL.S32 Q12,Q2,Q10 @o[0][0-3]
VMLA.S32 Q12,Q3,Q11 @o[0][4-7]
VADD.S32 D19,D16,D17
VMUL.S32 Q4,Q2,Q0
VMLA.S32 Q4,Q3,Q1
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[7][0-7]
VADD.S32 D26,D24,D25 @ dual issued with prev. instr.
VMUL.S32 Q6,Q2,Q10 @o[0][0-3]
VMLA.S32 Q6,Q3,Q11 @o[0][4-7]
VADD.S32 D27,D8,D9
VMUL.S32 Q4,Q2,Q0
VMLA.S32 Q4,Q3,Q1
VADD.S32 D12,D12,D13
@Q5 Q9 Q13 Q6
VPADD.S32 D14,D10,D11
VPADD.S32 D15,D18,D19
VPADD.S32 D16,D26,D27
VADD.S32 D13,D8,D9
VADD.S32 Q9,Q7,Q14
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[9][0-7]
VPADD.S32 D17,D12,D13 @ dual issued with prev. instr. in 2nd cycle
VMUL.S32 Q4,Q2,Q10 @o[0][0-3]
VMLA.S32 Q4,Q3,Q11 @o[0][4-7]
VADD.S32 Q12,Q8,Q14
VMUL.S32 Q6,Q2,Q0 @o[0][0-3]
VMLA.S32 Q6,Q3,Q1 @o[0][4-7]
VSHRN.S32 D26,Q9,#SHIFT
VSHRN.S32 D27,Q12,#SHIFT
VADD.S32 D10,D8,D9
@write to memory this should go to 1 3 5 7
ADD R2,R2,R7
LSL R7,R7,#1
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[11][0-7]
VADD.S32 D11,D12,D13 @ dual issued with prev. instr.
VST1.S32 D26[0],[R2],R7
VMUL.S32 Q7,Q2,Q10 @o[0][0-3]
VMLA.S32 Q7,Q3,Q11 @o[0][4-7]
VST1.S32 D26[1],[R2],R7
VMUL.S32 Q8,Q2,Q0 @o[0][0-3]
VMLA.S32 Q8,Q3,Q1 @o[0][4-7]
VST1.S32 D27[0],[R2],R7
VADD.S32 D18,D14,D15
VST1.S32 D27[1],[R2],R7
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[13][0-7]
VADD.S32 D19,D16,D17 @ dual issued with prev. instr.
VMUL.S32 Q12,Q2,Q10 @o[0][0-3]
VMLA.S32 Q12,Q3,Q11 @o[0][4-7]
VMUL.S32 Q4,Q2,Q0
VMLA.S32 Q4,Q3,Q1
VLD1.S32 {Q2,Q3},[R11],R12 @g_ai4_ihevc_trans_16[15][0-7]
VADD.S32 D26,D24,D25
VMUL.S32 Q6,Q2,Q10 @o[0][0-3]
VMLA.S32 Q6,Q3,Q11 @o[0][4-7]
VADD.S32 D27,D8,D9
VMUL.S32 Q4,Q2,Q0
VMLA.S32 Q4,Q3,Q1
VADD.S32 D12,D12,D13
@Q5 Q9 Q13 Q6
VPADD.S32 D14,D10,D11
VPADD.S32 D15,D18,D19
VPADD.S32 D16,D26,D27
VADD.S32 D13,D8,D9
VADD.S32 Q9,Q7,Q14
@ 1- cycle stall?
VPADD.S32 D17,D12,D13
VSHRN.S32 D22,Q9,#SHIFT
VADD.S32 Q10,Q8,Q14
@ 2-cycle stall?
VSHRN.S32 D23,Q10,#SHIFT
@this should go to 9 11 13 15
@LSL R11,R7,#1
VST1.S32 D22[0],[R2],R7
VST1.S32 D22[1],[R2],R7
VST1.S32 D23[0],[R2],R7
VST1.S32 D23[1],[R2],R7
SUB R2,R2,R7,LSL #3
LSR R7,R7,#1
SUB R2,R2,R7
ADD R2,R2,#4 @Move to the next column
ADD R8,R8,#2 @increment loop cntr by 2 since we process 2 columns per iteration
CMP R8,#16 @check loop cntr
BNE CORE_LOOP_16X16_VERT @loop back until all 16 columns are done
MOV R0,R3
ADD SP,SP,#32
vpop {d8 - d15}
LDMFD sp!,{r4-r12,PC} @restore registers and return