///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// push_v_regs
//   Saves q8-q15 followed by X8-X17 and X29/X30 on the stack using
//   pre-indexed store pairs. Total stack growth is 4*32 + 6*16 = 224 bytes,
//   a multiple of 16, so sp keeps whatever 16-byte alignment it had on entry.
//   Must be undone with pop_v_regs, which restores in the exact reverse order.
//   NOTE(review): AAPCS64 only requires the low 64 bits of v8-v15 to be
//   preserved and treats x8-x17 as caller-saved, so this saves more than the
//   ABI demands; kept as-is to leave the stack layout unchanged.
.macro push_v_regs
    stp     q8, q9, [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     X8, X9, [sp, #-16]!
    stp     X10, X11, [sp, #-16]!
    stp     X12, X13, [sp, #-16]!
    stp     X14, X15, [sp, #-16]!
    stp     X16, X17, [sp, #-16]!
    stp     X29, X30, [sp, #-16]!
.endm

// pop_v_regs
//   Restores X29/X30, X17..X8 and then q15..q8 with post-indexed load pairs,
//   i.e. the exact reverse of the order used by push_v_regs, releasing the
//   same 6*16 + 4*32 = 224 bytes of stack.
.macro pop_v_regs
    ldp     X29, X30, [sp], #16
    ldp     X16, X17, [sp], #16
    ldp     X14, X15, [sp], #16
    ldp     X12, X13, [sp], #16
    ldp     X10, X11, [sp], #16
    ldp     X8, X9, [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8, q9, [sp], #32
.endm

.text
.global ixheaacd_over_lap_add2_armv8

//------------------------------------------------------------------------------
// ixheaacd_over_lap_add2_armv8
//
// Windowed combination of two 32-bit input buffers into a strided 32-bit
// output buffer, done in two passes (LOOP_1 and LOOP_2), four outputs per
// NEON vector and eight outputs per loop iteration.
//
// Register use on entry (roles are read off the code below; the C-level
// parameter names are not visible in this file):
//   X0  base of 32-bit input A. Pass 1 reads forwards starting at word index
//       X5; pass 2 reads backwards starting at word index 2*X5 - 1.
//   X1  base of 32-bit input B. Pass 1 reads backwards starting at word index
//       X5 - 1; pass 2 reads forwards from index 0.
//   X2  base of 32-bit output. Pass 1 writes from X2; pass 2 writes from
//       X2 + 4 * (X5 * X6). Consecutive outputs are 4 * X6 bytes apart.
//   X3  base of 16-bit coefficient table, consumed two halfwords per output
//       (even-index and odd-index halfwords are de-interleaved by LD2).
//       Pass 1 reads forwards from index 0; pass 2 reads backwards from the
//       8 halfwords ending at index 2*X5 - 1.
//   X4  shift control value s (see V20/V21 below).
//   X5  element count per pass.
//   X6  output stride in 32-bit words.
//
// Per-output arithmetic (both passes), with x, y 32-bit inputs and c0, c1
// 16-bit coefficients:
//   acc  = lo16(x)*c0 - lo16(y)*c1        (UMULL/UMLSL, unsigned halfwords)
//   acc  = acc >> 16                      (SSHR, arithmetic)
//   acc += hi16(x)*c0 - hi16(y)*c1        (SMLAL/SMLSL, signed halfwords)
//   out  = SSHL(SQADD(acc, V20), V21)
// where V21 = s - 15 in every lane (a negative SSHL count shifts right) and
// V20 = 1 << (14 - s) in every lane.
// In pass 2, x is the saturating negation (SQNEG) of input A.
//
// Long-lived vector registers:
//   V20  rounding constant, V21  shift count
//   V24 / V16  finished results waiting to be stored lane by lane
//
// NOTE(review): X5 and X6 are consumed as full 64-bit registers
//   (MOV X8, X5 / LSL X11, X6, #2); this presumably relies on the caller
//   passing them already extended to 64 bits - confirm against the prototype.
// NOTE(review): each loop body runs at least once (the count is tested only
//   at the bottom), so each pass produces at least 16 outputs; the code
//   appears to assume X5 is a multiple of 8 and >= 16 - confirm with callers.
//------------------------------------------------------------------------------
ixheaacd_over_lap_add2_armv8:
    push_v_regs

    // ---- Pass 1 setup: pointers, stride, rounding constant, shift ----------
    MOV     X8, X5                          // X8 = pass-1 loop counter
    SUB     X12, X5, #1
    LSL     X9, X5, #2
    LSL     X12, X12, #2
    ADD     X10, X0, X9                     // X10 = X0 + 4*X5 (input A, forwards)
    ADD     X7, X1, X12                     // X7  = X1 + 4*(X5-1)
    ADD     X4, X4, #1
    LD2     {V0.4H, V1.4H}, [X10], #16      // V0 = low halfwords, V1 = high halfwords of 4 words
    LSL     X11, X6, #2                     // X11 = output stride in bytes
    SUB     X7, X7, #12                     // X7 -> last 4 words of input B (read backwards)
    SUB     X4, X4, #16                     // X4 = s - 15
    MOV     X12, #-16                       // backwards post-index step
    MOV     X13, #1
    ADD     X14, X4, #1
    NEG     X14, X14                        // X14 = 14 - s
    DUP     V21.4S, W4                      // V21 = shift count (s - 15) per lane
    LD2     {V6.4H, V7.4H}, [X7], X12
    LSL     X4, X13, X14                    // X4 = 1 << (14 - s)
    REV64   V4.4H, V6.4H                    // reverse element order of the backwards-read block
    DUP     V20.4S, W4                      // V20 = rounding constant per lane
    REV64   V5.4H, V7.4H
    MOV     X4, X3                          // keep coefficient base for pass 2

    MOV     X9, X2                          // keep output base for pass 2
    LD2     {V2.4H, V3.4H}, [X3], #16       // V2 = even-index, V3 = odd-index coefficients

    // ---- Pass 1 prologue: compute first vector, preload second -------------
    UMULL   V23.4S, V0.4H, V2.4H
    UMLSL   V23.4S, V4.4H, V3.4H
    LD2     {V8.4H, V9.4H}, [X10], #16
    SSHR    V23.4S, V23.4S, #16
    LD2     {V10.4H, V11.4H}, [X3], #16
    SMLAL   V23.4S, V1.4H, V2.4H
    SMLSL   V23.4S, V5.4H, V3.4H
    LD2     {V14.4H, V15.4H}, [X7], X12
    REV64   V12.4H, V14.4H
    REV64   V13.4H, V15.4H
    SQADD   V22.4S, V23.4S, V20.4S          // add rounding constant (saturating)
    SSHL    V22.4S, V22.4S, V21.4S          // apply shift
    MOV     V24.16B, V22.16B                // V24 = results pending store
    SUB     X8, X8, #8

    // ---- Pass 1 main loop: 8 outputs per iteration -------------------------
    // Stores of the previous vector (V24) are interleaved with the arithmetic
    // for the preloaded vector (V8..V13 -> V19) and the next one (V0..V5 -> V23).
LOOP_1:

    LD2     {V0.4H, V1.4H}, [X10], #16
    UMULL   V19.4S, V8.4H, V10.4H
    LD2     {V2.4H, V3.4H}, [X3], #16
    UMLSL   V19.4S, V12.4H, V11.4H
    LD2     {V6.4H, V7.4H}, [X7], X12
    UMULL   V23.4S, V0.4H, V2.4H
    REV64   V4.4H, V6.4H
    UMLSL   V23.4S, V4.4H, V3.4H
    REV64   V5.4H, V7.4H
    SSHR    V19.4S, V19.4S, #16
    ST1     {V24.S}[0], [X2], X11
    SMLAL   V19.4S, V9.4H, V10.4H
    ST1     {V24.S}[1], [X2], X11
    SSHR    V23.4S, V23.4S, #16
    ST1     {V24.S}[2], [X2], X11
    SMLAL   V23.4S, V1.4H, V2.4H

    ST1     {V24.S}[3], [X2], X11
    SMLSL   V19.4S, V13.4H, V11.4H
    SMLSL   V23.4S, V5.4H, V3.4H

    LD2     {V8.4H, V9.4H}, [X10], #16
    LD2     {V10.4H, V11.4H}, [X3], #16

    LD2     {V14.4H, V15.4H}, [X7], X12
    SQADD   V18.4S, V19.4S, V20.4S
    REV64   V12.4H, V14.4H
    REV64   V13.4H, V15.4H
    SQADD   V22.4S, V23.4S, V20.4S
    SSHL    V18.4S, V18.4S, V21.4S
    MOV     V16.16B, V18.16B
    ST1     {V16.S}[0], [X2], X11
    SSHL    V22.4S, V22.4S, V21.4S

    MOV     V24.16B, V22.16B
    SUBS    X8, X8, #8                      // flags consumed by BGT below; the ST1s in between leave them intact

    ST1     {V16.S}[1], [X2], X11
    ST1     {V16.S}[2], [X2], X11
    ST1     {V16.S}[3], [X2], X11

    BGT     LOOP_1

    // ---- Pass 1 epilogue: drain the last two vectors -----------------------
    // Scalar setup for pass 2 is interleaved with the remaining stores.
    ST1     {V24.S}[0], [X2], X11
    UMULL   V19.4S, V8.4H, V10.4H
    UMLSL   V19.4S, V12.4H, V11.4H
    ST1     {V24.S}[1], [X2], X11
    ST1     {V24.S}[2], [X2], X11
    SSHR    V19.4S, V19.4S, #16
    ST1     {V24.S}[3], [X2], X11
    SMLAL   V19.4S, V9.4H, V10.4H
    SMLSL   V19.4S, V13.4H, V11.4H
    MOV     X12, #12
    MOV     V30.S[0], W5
    MOV     V31.S[0], W6
    SMULL   V29.4S, V30.4H, V31.4H          // lane 0 = (low 16 bits of X5) * (low 16 bits of X6), signed
    MOV     W7, V29.S[0]                    // W7 = X5 * X6 (only lane 0 is used)

    LSL     W10, W5, #1                     // X10 = 2 * X5
    SQADD   V18.4S, V19.4S, V20.4S
    SSHL    V18.4S, V18.4S, V21.4S
    MOV     V16.16B, V18.16B

    ST1     {V16.S}[0], [X2], X11
    LSL     X7, X7, #2

    ST1     {V16.S}[1], [X2], X11
    ADD     X7, X7, X9                      // X7 = output base + 4 * X5 * X6 (pass-2 output pointer)

    ST1     {V16.S}[2], [X2], X11
    ST1     {V16.S}[3], [X2], X11

    // ---- Pass 2 setup ------------------------------------------------------
    SUB     X11, X10, #1                    // X11 = 2*X5 - 1
    LSL     X10, X11, #2
    ADD     X10, X0, X10                    // X10 = X0 + 4*(2*X5 - 1)
    LSL     X11, X11, #1
    SUB     X10, X10, X12                   // X10 -> 4 words ending at index 2*X5 - 1 (read backwards)
    LSL     X8, X6, #2                      // X8 = output stride in bytes
    MOV     X12, #-16                       // backwards post-index step
    ADD     X11, X11, X4                    // X4 = saved coefficient base

    LD1     {V6.4S}, [X10], X12
    SUB     X11, X11, #14                   // X11 -> 8 halfwords ending at coefficient index 2*X5 - 1

    // Negate input A and split it into low/high halfwords in reversed
    // element order: V1.4H = low halves, V0.4H = high halves.
    REV64   V0.4S, V6.4S
    SQNEG   V0.4S, V0.4S

    UZP1    V1.8H, V0.8H, V0.8H
    UZP2    V0.8H, V0.8H, V0.8H
    REV64   V1.4S, V1.4S
    REV64   V0.4S, V0.4S
    LD2     {V2.4H, V3.4H}, [X11], X12
    REV64   V2.4H, V2.4H                    // coefficients are read backwards, so reverse each group
    REV64   V3.4H, V3.4H

    LD2     {V4.4H, V5.4H}, [X1], #16       // input B, forwards: V4 = low halves, V5 = high halves

    // ---- Pass 2 prologue ---------------------------------------------------
    UMULL   V23.4S, V1.4H, V3.4H
    UMLSL   V23.4S, V4.4H, V2.4H
    SSHR    V23.4S, V23.4S, #16
    SMLAL   V23.4S, V0.4H, V3.4H
    SMLSL   V23.4S, V5.4H, V2.4H
    SQADD   V22.4S, V23.4S, V20.4S
    SSHL    V22.4S, V22.4S, V21.4S
    MOV     V24.16B, V22.16B

    // NOTE(review): the sequence below recomputes V23/V22/V24 from the same
    // operands (V0-V5, V20, V21 are not written in between), so the eight
    // instructions just above look redundant. Left untouched to keep the
    // instruction stream identical.
    LD1     {V14.4S}, [X10], X12
    UMULL   V23.4S, V1.4H, V3.4H
    UMLSL   V23.4S, V4.4H, V2.4H
    REV64   V8.4S, V14.4S
    SQNEG   V8.4S, V8.4S
    LD2     {V10.4H, V11.4H}, [X11], X12
    SSHR    V23.4S, V23.4S, #16
    LD2     {V12.4H, V13.4H}, [X1], #16
    SMLAL   V23.4S, V0.4H, V3.4H
    SMLSL   V23.4S, V5.4H, V2.4H
    UZP1    V9.8H, V8.8H, V8.8H
    UZP2    V8.8H, V8.8H, V8.8H
    rev64   v9.4s, v9.4s
    rev64   v8.4s, v8.4s
    REV64   V10.4H, V10.4H
    REV64   V11.4H, V11.4H
    SQADD   V22.4S, V23.4S, V20.4S
    SUB     X5, X5, #8                      // X5 is reused as the pass-2 loop counter
    SSHL    V22.4S, V22.4S, V21.4S
    MOV     V24.16B, V22.16B

    // ---- Pass 2 main loop: 8 outputs per iteration -------------------------
LOOP_2:

    LD1     {V6.4S}, [X10], X12
    UMULL   V19.4S, V9.4H, V11.4H
    REV64   V0.4S, V6.4S
    SQNEG   V0.4S, V0.4S
    UZP1    V1.8H, V0.8H, V0.8H
    UZP2    V0.8H, V0.8H, V0.8H
    REV64   V1.4S, V1.4S
    REV64   V0.4S, V0.4S
    LD2     {V2.4H, V3.4H}, [X11], X12
    REV64   V2.8H, V2.8H                    // .8H form here; only V2.4H / V3.4H are consumed below
    REV64   V3.8H, V3.8H

    LD2     {V4.4H, V5.4H}, [X1], #16
    UMLSL   V19.4S, V12.4H, V10.4H
    ST1     {V24.S}[0], [X7], X8
    UMULL   V23.4S, V1.4H, V3.4H
    ST1     {V24.S}[1], [X7], X8
    SSHR    V19.4S, V19.4S, #16
    ST1     {V24.S}[2], [X7], X8
    UMLSL   V23.4S, V4.4H, V2.4H
    ST1     {V24.S}[3], [X7], X8
    SMLAL   V19.4S, V8.4H, V11.4H
    LD1     {V14.4S}, [X10], X12
    SSHR    V23.4S, V23.4S, #16
    SMLSL   V19.4S, V13.4H, V10.4H
    LD2     {V10.4H, V11.4H}, [X11], X12
    SMLAL   V23.4S, V0.4H, V3.4H
    SMLSL   V23.4S, V5.4H, V2.4H
    REV64   V8.4S, V14.4S
    LD2     {V12.4H, V13.4H}, [X1], #16
    SQNEG   V8.4S, V8.4S
    REV64   V11.4H, V11.4h
    REV64   V10.4H, V10.4H
    SQADD   V18.4S, V19.4S, V20.4S
    UZP1    V9.8H, V8.8H, V8.8H
    UZP2    V8.8H, V8.8H, V8.8H
    rev64   v9.4s, v9.4s
    rev64   v8.4s, v8.4s
    SQADD   V22.4S, V23.4S, V20.4S
    SSHL    V18.4S, V18.4S, V21.4S
    SUBS    X5, X5, #8                      // flags consumed by BGT below; nothing in between alters them
    MOV     V16.16B, V18.16B
    ST1     {V16.S}[0], [X7], X8
    SSHL    V22.4S, V22.4S, V21.4S
    ST1     {V16.S}[1], [X7], X8
    MOV     V24.16B, V22.16B

    ST1     {V16.S}[2], [X7], X8
    ST1     {V16.S}[3], [X7], X8

    BGT     LOOP_2

    // ---- Pass 2 epilogue: drain the last two vectors -----------------------
    ST1     {V24.S}[0], [X7], X8
    UMULL   V19.4S, V9.4H, V11.4H
    UMLSL   V19.4S, V12.4H, V10.4H
    ST1     {V24.S}[1], [X7], X8
    ST1     {V24.S}[2], [X7], X8
    SSHR    V19.4S, V19.4S, #16
    ST1     {V24.S}[3], [X7], X8

    SMLAL   V19.4S, V8.4H, V11.4H
    SMLSL   V19.4S, V13.4H, V10.4H
    SQADD   V18.4S, V19.4S, V20.4S
    SSHL    V18.4S, V18.4S, V21.4S
    MOV     V16.16B, V18.16B

    ST1     {V16.S}[0], [X7], X8
    ST1     {V16.S}[1], [X7], X8
    ST1     {V16.S}[2], [X7], X8
    ST1     {V16.S}[3], [X7], X8

    pop_v_regs
    RET