/*
 * Copyright (C) 2013-2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

#define BLEND_LIST(X) \
    X(0, CLEAR) \
    X(1, SRC) \
    X(2, DST) \
    X(3, SRC_OVER) \
    X(4, DST_OVER) \
    X(5, SRC_IN) \
    X(6, DST_IN) \
    X(7, SRC_OUT) \
    X(8, DST_OUT) \
    X(9, SRC_ATOP) \
    X(10, DST_ATOP) \
    X(11, XOR) \
    X(14, MULTIPLY) \
    X(21, DIFFERENCE) \
    X(34, ADD) \
    X(35, SUBTRACT)

/* For every blend operation supported, define a macro with just the arithmetic
 * component.  The rest can be handled later on.
 *
 * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
 * contain the data from the source buffer.  Both have already been split out
 * into one colour component per register (if necessary).  q3 and q11 contain
 * the alpha components.
 *
 * At the same time as defining the assembly macro, define a corresponding
 * preprocessor macro indicating any other requirements.
 *    zipped=0 -- The macro does not require the RGBA components to be
 *                separated.
 *    lddst=0  -- The macro does not require data from the destination buffer.
 *    ldsrc=0  -- The macro does not require data from the source buffer.
 *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
 *                inserted without any surrounding load/store or loop code.
 */
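
/* A note on the arithmetic in the kernels below: products of two bytes are
 * formed with widening multiplies (umull/umull2), giving a 16-bit value t
 * per lane.  The rshrn/uaddw/rshrn sequences that follow compute
 * (t + 128 + ((t + 128) >> 8)) >> 8, which is exactly t / 255 rounded to
 * nearest for any such product.  The ATOP kernels use the equivalent
 * urshr/uqadd/uqrshrn form, which saturates instead of wrapping.
 */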

#define params_CLEAR zipped=0, lddst=0, ldsrc=0
.macro blend_kernel_CLEAR
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0
.endm

#define params_SRC zipped=0, lddst=0
.macro blend_kernel_SRC
        mov     v0.16b, v8.16b
        mov     v1.16b, v9.16b
        mov     v2.16b, v10.16b
        mov     v3.16b, v11.16b
.endm

#define params_DST nowrap=1
.macro blend_kernel_DST
        /* nop */
.endm
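
/* SRC_OVER: res = src + dst * (255 - src.a) / 255, applied to all four
 * channels (alpha included).  The final uqadd saturates rather than wraps.
 */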
#define params_SRC_OVER zipped=1
.macro blend_kernel_SRC_OVER
        mvn     v7.16b, v11.16b

        umull2  v12.8h, v7.16b, v0.16b
        umull   v0.8h, v7.8b, v0.8b
        umull2  v13.8h, v7.16b, v1.16b
        umull   v1.8h, v7.8b, v1.8b
        umull2  v14.8h, v7.16b, v2.16b
        umull   v2.8h, v7.8b, v2.8b
        umull2  v15.8h, v7.16b, v3.16b
        umull   v3.8h, v7.8b, v3.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm
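
/* DST_OVER: res = dst + src * (255 - dst.a) / 255, per channel. */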
#define params_DST_OVER zipped=1
.macro blend_kernel_DST_OVER
        mvn     v7.16b, v3.16b

        umull2  v12.8h, v7.16b, v8.16b
        umull   v8.8h, v7.8b, v8.8b
        umull2  v13.8h, v7.16b, v9.16b
        umull   v9.8h, v7.8b, v9.8b
        umull2  v14.8h, v7.16b, v10.16b
        umull   v10.8h, v7.8b, v10.8b
        umull2  v15.8h, v7.16b, v11.16b
        umull   v11.8h, v7.8b, v11.8b

        rshrn   v4.8b, v8.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v9.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v10.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v11.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v8.8h, v8.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v9.8h, v9.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v10.8h, v10.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v11.8h, v11.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v8.8b, v8.8h, #8
        rshrn2  v8.16b, v12.8h, #8
        rshrn   v9.8b, v9.8h, #8
        rshrn2  v9.16b, v13.8h, #8
        rshrn   v10.8b, v10.8h, #8
        rshrn2  v10.16b, v14.8h, #8
        rshrn   v11.8b, v11.8h, #8
        rshrn2  v11.16b, v15.8h, #8

        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm
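
/* SRC_IN: res = src * dst.a / 255, per channel. */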
#define params_SRC_IN zipped=1
.macro blend_kernel_SRC_IN
        umull2  v12.8h, v3.16b, v8.16b
        umull   v0.8h, v3.8b, v8.8b
        umull2  v13.8h, v3.16b, v9.16b
        umull   v1.8h, v3.8b, v9.8b
        umull2  v14.8h, v3.16b, v10.16b
        umull   v2.8h, v3.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm
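
/* DST_IN: res = dst * src.a / 255, per channel. */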
#define params_DST_IN zipped=1
.macro blend_kernel_DST_IN
        umull2  v12.8h, v0.16b, v11.16b
        umull   v0.8h, v0.8b, v11.8b
        umull2  v13.8h, v1.16b, v11.16b
        umull   v1.8h, v1.8b, v11.8b
        umull2  v14.8h, v2.16b, v11.16b
        umull   v2.8h, v2.8b, v11.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm
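
/* SRC_OUT: res = src * (255 - dst.a) / 255, reusing SRC_IN with the
 * destination alpha inverted.
 */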
#define params_SRC_OUT zipped=1
.macro blend_kernel_SRC_OUT
        mvn     v3.16b, v3.16b
        blend_kernel_SRC_IN
.endm
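
/* DST_OUT: res = dst * (255 - src.a) / 255, reusing DST_IN with the
 * source alpha inverted.
 */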
#define params_DST_OUT zipped=1
.macro blend_kernel_DST_OUT
        mvn     v11.16b, v11.16b
        blend_kernel_DST_IN
.endm
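
/* SRC_ATOP: res.rgb = (dst.rgb * (255 - src.a) + src.rgb * dst.a) / 255;
 * the destination alpha in v3 is left unchanged.  The two products are
 * accumulated with saturating 16-bit adds before the rounding step.
 */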
#define params_SRC_ATOP zipped=1
.macro blend_kernel_SRC_ATOP
        mvn     v11.16b, v11.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8
.endm
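
/* DST_ATOP: res.rgb = (dst.rgb * src.a + src.rgb * (255 - dst.a)) / 255,
 * and the result alpha becomes the source alpha (the final mov into v3).
 */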
#define params_DST_ATOP zipped=1
.macro blend_kernel_DST_ATOP
        mvn     v3.16b, v3.16b

        umull2  v12.8h, v11.16b, v0.16b
        umull   v0.8h, v11.8b, v0.8b
        umull2  v13.8h, v11.16b, v1.16b
        umull   v1.8h, v11.8b, v1.8b
        umull2  v14.8h, v11.16b, v2.16b
        umull   v2.8h, v11.8b, v2.8b

        umull2  v4.8h, v3.16b, v8.16b
        umull   v8.8h, v3.8b, v8.8b
        umull2  v5.8h, v3.16b, v9.16b
        umull   v9.8h, v3.8b, v9.8b
        umull2  v6.8h, v3.16b, v10.16b
        umull   v10.8h, v3.8b, v10.8b

        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v14.8h, v14.8h, v6.8h
        uqadd   v2.8h, v2.8h, v10.8h

        urshr   v8.8h, v0.8h, #8
        urshr   v4.8h, v12.8h, #8
        urshr   v9.8h, v1.8h, #8
        urshr   v5.8h, v13.8h, #8
        urshr   v10.8h, v2.8h, #8
        urshr   v6.8h, v14.8h, #8

        uqadd   v0.8h, v0.8h, v8.8h
        uqadd   v12.8h, v12.8h, v4.8h
        uqadd   v1.8h, v1.8h, v9.8h
        uqadd   v13.8h, v13.8h, v5.8h
        uqadd   v2.8h, v2.8h, v10.8h
        uqadd   v14.8h, v14.8h, v6.8h

        uqrshrn v0.8b, v0.8h, #8
        uqrshrn2 v0.16b, v12.8h, #8
        uqrshrn v1.8b, v1.8h, #8
        uqrshrn2 v1.16b, v13.8h, #8
        uqrshrn v2.8b, v2.8h, #8
        uqrshrn2 v2.16b, v14.8h, #8

        mov     v3.16b, v11.16b
.endm
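
/* MULTIPLY: res = src * dst / 255, per channel. */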
#define params_MULTIPLY zipped=0
.macro blend_kernel_MULTIPLY
        umull2  v12.8h, v0.16b, v8.16b
        umull   v0.8h, v0.8b, v8.8b
        umull2  v13.8h, v1.16b, v9.16b
        umull   v1.8h, v1.8b, v9.8b
        umull2  v14.8h, v2.16b, v10.16b
        umull   v2.8h, v2.8b, v10.8b
        umull2  v15.8h, v3.16b, v11.16b
        umull   v3.8h, v3.8b, v11.8b

        rshrn   v4.8b, v0.8h, #8
        rshrn2  v4.16b, v12.8h, #8
        rshrn   v5.8b, v1.8h, #8
        rshrn2  v5.16b, v13.8h, #8
        rshrn   v6.8b, v2.8h, #8
        rshrn2  v6.16b, v14.8h, #8
        rshrn   v7.8b, v3.8h, #8
        rshrn2  v7.16b, v15.8h, #8

        uaddw   v0.8h, v0.8h, v4.8b
        uaddw2  v12.8h, v12.8h, v4.16b
        uaddw   v1.8h, v1.8h, v5.8b
        uaddw2  v13.8h, v13.8h, v5.16b
        uaddw   v2.8h, v2.8h, v6.8b
        uaddw2  v14.8h, v14.8h, v6.16b
        uaddw   v3.8h, v3.8h, v7.8b
        uaddw2  v15.8h, v15.8h, v7.16b

        rshrn   v0.8b, v0.8h, #8
        rshrn2  v0.16b, v12.8h, #8
        rshrn   v1.8b, v1.8h, #8
        rshrn2  v1.16b, v13.8h, #8
        rshrn   v2.8b, v2.8h, #8
        rshrn2  v2.16b, v14.8h, #8
        rshrn   v3.8b, v3.8h, #8
        rshrn2  v3.16b, v15.8h, #8
.endm
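
/* ADD: res = min(dst + src, 255), per channel (saturating add). */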
#define params_ADD zipped=0
.macro blend_kernel_ADD
        uqadd   v0.16b, v0.16b, v8.16b
        uqadd   v1.16b, v1.16b, v9.16b
        uqadd   v2.16b, v2.16b, v10.16b
        uqadd   v3.16b, v3.16b, v11.16b
.endm
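
/* SUBTRACT: res = max(dst - src, 0), per channel (saturating subtract). */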
#define params_SUBTRACT zipped=0
.macro blend_kernel_SUBTRACT
        uqsub   v0.16b, v0.16b, v8.16b
        uqsub   v1.16b, v1.16b, v9.16b
        uqsub   v2.16b, v2.16b, v10.16b
        uqsub   v3.16b, v3.16b, v11.16b
.endm
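
/* DIFFERENCE: res = |dst - src|, per channel. */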
#define params_DIFFERENCE zipped=0
.macro blend_kernel_DIFFERENCE
        uabd    v0.16b, v0.16b, v8.16b
        uabd    v1.16b, v1.16b, v9.16b
        uabd    v2.16b, v2.16b, v10.16b
        uabd    v3.16b, v3.16b, v11.16b
.endm
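
/* XOR: bitwise exclusive-or of the two buffers; note that this is not the
 * Porter-Duff XOR operator.
 */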
#define params_XOR zipped=0
.macro blend_kernel_XOR
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v3.16b, v3.16b, v11.16b
.endm


/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Various sections of assembly code are dropped or substituted for
 * simpler operations if they're not needed.
 */
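
/* Under the AAPCS64 calling convention the low 64 bits of v8-v15 must be
 * preserved across calls, which is why the wrapper spills them with .1d
 * (bottom-half) stores before the loop and restores them on exit.
 */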
.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
.if \nowrap
        \kernel
.else
        sub     x3, sp, #32
        sub     sp, sp, #64
        st1     {v8.1d - v11.1d}, [sp]
        st1     {v12.1d - v15.1d}, [x3]
        subs    x2, x2, #64
        b       2f
        .align 4
1:
.if \lddst
  .if \zipped
        ld4     {v0.16b - v3.16b}, [x0]
  .else
        ld1     {v0.16b - v3.16b}, [x0]
  .endif
.endif
.if \ldsrc
  .if \zipped
        ld4     {v8.16b - v11.16b}, [x1], #64
  .else
        ld1     {v8.16b - v11.16b}, [x1], #64
  .endif
.endif
.if \pld
#if 0 /* TODO: test this on real hardware */
  .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
  .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
#endif
.endif

        \kernel

        subs    x2, x2, #64
.if \zipped
        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
.else
        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
.endif

2:      bge     1b
        adds    x2, x2, #64
        beq     2f

        /* To handle the tail portion of the data (something less than 64
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the operations
         * don't require data to interact with its neighbours.
         */
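        /* Each tbz below tests one bit of the remaining byte count in x2, so
         * the chunks transferred (32, 16, 8, 4, 2 and 1 bytes) add up to
         * exactly the leftover length.
         */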
        movi    v0.16b, #0
        movi    v1.16b, #0
        movi    v2.16b, #0
        movi    v3.16b, #0

        movi    v8.16b, #0
        movi    v9.16b, #0
        movi    v10.16b, #0
        movi    v11.16b, #0

        tbz     x2, #5, 1f
.if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32 ; .endif
.if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
1:      tbz     x2, #4, 1f
.if \lddst ; ld1     {v1.16b}, [x0], #16 ; .endif
.if \ldsrc ; ld1     {v9.16b}, [x1], #16 ; .endif
1:      tbz     x2, #3, 1f
.if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
.if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
1:      tbz     x2, #2, 1f
.if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
.if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
1:      tbz     x2, #1, 1f
.if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
.if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
1:      tbz     x2, #0, 1f
.if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
.if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
1:
.if \lddst ; sub     x0, x0, x2 ; .endif

.if \zipped
        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point.
         */
        uzp1    v4.16b, v0.16b, v1.16b
        uzp2    v5.16b, v0.16b, v1.16b
        uzp1    v6.16b, v2.16b, v3.16b
        uzp2    v7.16b, v2.16b, v3.16b
        uzp1    v0.16b, v4.16b, v6.16b
        uzp2    v2.16b, v4.16b, v6.16b
        uzp1    v1.16b, v5.16b, v7.16b
        uzp2    v3.16b, v5.16b, v7.16b

        uzp1    v4.16b, v8.16b, v9.16b
        uzp2    v5.16b, v8.16b, v9.16b
        uzp1    v6.16b, v10.16b, v11.16b
        uzp2    v7.16b, v10.16b, v11.16b
        uzp1    v8.16b, v4.16b, v6.16b
        uzp2    v10.16b, v4.16b, v6.16b
        uzp1    v9.16b, v5.16b, v7.16b
        uzp2    v11.16b, v5.16b, v7.16b

        \kernel

        zip1    v4.16b, v0.16b, v2.16b
        zip2    v6.16b, v0.16b, v2.16b
        zip1    v5.16b, v1.16b, v3.16b
        zip2    v7.16b, v1.16b, v3.16b
        zip1    v0.16b, v4.16b, v5.16b
        zip2    v1.16b, v4.16b, v5.16b
        zip1    v2.16b, v6.16b, v7.16b
        zip2    v3.16b, v6.16b, v7.16b
.else
        \kernel
.endif

        tbz     x2, #5, 1f
        st1     {v2.16b,v3.16b}, [x0], #32
1:      tbz     x2, #4, 1f
        st1     {v1.16b}, [x0], #16
1:      tbz     x2, #3, 1f
        st1     {v0.d}[1], [x0], #8
1:      tbz     x2, #2, 1f
        st1     {v0.s}[1], [x0], #4
1:      tbz     x2, #1, 1f
        st1     {v0.h}[1], [x0], #2
1:      tbz     x2, #0, 2f
        st1     {v0.b}[1], [x0], #1
2:      ld1     {v8.1d - v11.1d}, [sp], #32
        ld1     {v12.1d - v15.1d}, [sp], #32
.endif
        mov     x0, #0
        ret
.endm

/* produce list of blend_line_XX() functions; each function uses the wrap_line
 * macro, passing it the name of the operation macro it wants along with
 * optional parameters to remove unnecessary operations.
 */
#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
BLEND_LIST(BLEND_X)
#undef BLEND_X

#define BLEND_X(d, n) .set tablesize, d+1 ;
BLEND_LIST(BLEND_X)
#undef BLEND_X

/* int rsdIntrinsicBlend_K(
 *         uchar4 *out,        // x0
 *         uchar4 const *in,   // x1
 *         int slot,           // x2
 *         size_t xstart,      // x3
 *         size_t xend);       // x4
 */
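
/* tablesize, set by the second BLEND_X pass above, is the number of entries
 * in blendtable.  The slot argument indexes that table of halfword offsets;
 * an out-of-range slot or a zero entry returns -1.  Otherwise the buffer
 * pointers are advanced to xstart, the pixel count is converted to a byte
 * count, and control branches to the chosen blend_line_* routine, which
 * returns 0 on completion.  Offsets are taken relative to label 2 below and
 * loaded sign-extended (ldrsh), since the blend_line_* functions precede
 * this point in the text section.
 */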
ENTRY(rsdIntrinsicBlend_K)
        adrp    x5, blendtable
        add     x5, x5, :lo12:blendtable
        cmp     w2, tablesize
        bhs     1f
        ldrsh   x6, [x5, w2, uxtw #1]
        add     x0, x0, w3, uxtw #2
        add     x1, x1, w3, uxtw #2
        sub     w2, w4, w3
        ubfiz   x2, x2, #2, #32 /* TODO: fix */
        cbz     x6, 1f
        adr     x5, 2f
        add     x6, x5, x6
2:      br      x6
1:      mov     x0, #-1
        ret
END(rsdIntrinsicBlend_K)
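
/* blendtable holds one .hword entry per slot, up to the highest slot number
 * used in BLEND_LIST.  Implemented slots store the offset of the matching
 * blend_line_* function relative to label 2 in rsdIntrinsicBlend_K above;
 * the .rept in BLEND_X zero-fills any unimplemented slots in between.
 */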
.rodata
.set off,0
blendtable:
#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
BLEND_LIST(BLEND_X)
#undef BLEND_X