You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
257 lines
7.7 KiB
257 lines
7.7 KiB
/*
|
|
* Copyright (C) 2014 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/* ENTRY(f) switches to .text, aligns to 2^4 bytes, exports f as a function
 * symbol, defines the label and opens its unwind region (.fnstart).
 * END(f) closes the unwind region and records the size of the symbol.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
 *
 * Trilinear interpolation of two table lookups ("lane 0" and "lane 1").
 *
 *   dst       D register that receives the two packed 4-byte results.
 *   src       D register holding two 32-bit byte offsets into the table.
 *             It is read by the first instruction only, so it may be the
 *             same register as dst.
 *   xr0, xr1  16-bit scalars: x blend weight for lane 0 / lane 1.
 *   yr0, yr1  8-bit scalars:  y blend weight for lane 0 / lane 1.
 *   zr0, zr1  16-bit scalars: z blend weight for lane 0 / lane 1.
 *
 * Register inputs: r3 = table base, r4 = y pitch, r5 = z pitch (bytes).
 * Clobbers:        r6, r7, d6, d7 (q3), q8-q15.
 *
 * Every blend has the form  a * (256 - w) + b * w, built as
 * (a << 8) - a * w + b * w.  Each 8-byte load picks up the table entry at
 * the computed offset together with the following 4 bytes (the x+1
 * neighbour), so the x blend is done last, between the low and high halves
 * of the same vector.
 */
.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1

            /* r6/r7 = table address for lane 0 / lane 1 */
            vmov        r6, r7, \src

            add         r6, r6, r3
            add         r7, r7, r3

            /* front plane, row y; pointers advance by pitchy to row y+1 */
            vld1.u8     d16, [r6], r4
            vld1.u8     d17, [r7], r4

            /* front plane, row y+1; pointers advance by pitchz to the rear plane */
            vld1.u8     d18, [r6], r5
            vld1.u8     d19, [r7], r5

            vdup.u8     d6, \yr0
            vdup.u8     d7, \yr1
            /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
            vshll.u8    q12, d16, #8
            vshll.u8    q13, d17, #8
            vmlsl.u8    q12, d16, d6
            vmlsl.u8    q13, d17, d7
            vmlal.u8    q12, d18, d6
            vmlal.u8    q13, d19, d7

            /* rear plane, row y+1 */
            vld1.u8     d18, [r6]
            vld1.u8     d19, [r7]

            /* step back by pitchy to rear plane, row y */
            sub         r6, r6, r4
            sub         r7, r7, r4

            vld1.u8     d16, [r6]
            vld1.u8     d17, [r7]

            /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
            vshll.u8    q14, d16, #8
            vshll.u8    q15, d17, #8
            vmlsl.u8    q14, d16, d6
            vmlsl.u8    q15, d17, d7
            vmlal.u8    q14, d18, d6
            vmlal.u8    q15, d19, d7

            /* Z interpolate, lane 0 q12/q14 -> q10 */
            vshll.u16   q8, d24, #8
            vshll.u16   q9, d25, #8
            vmlsl.u16   q8, d24, \zr0
            vmlsl.u16   q9, d25, \zr0
            vmlal.u16   q8, d28, \zr0
            vmlal.u16   q9, d29, \zr0
            vrshrn.u32  d20, q8, #8
            vrshrn.u32  d21, q9, #8

            /* Z interpolate, lane 1 q13/q15 -> q11 */
            vshll.u16   q8, d26, #8
            vshll.u16   q9, d27, #8
            vmlsl.u16   q8, d26, \zr1
            vmlsl.u16   q9, d27, \zr1
            vmlal.u16   q8, d30, \zr1
            vmlal.u16   q9, d31, \zr1
            vrshrn.u32  d22, q8, #8
            vrshrn.u32  d23, q9, #8

            /* X interpolate, lanes 0 and 1 q10,q11 -> q14
             * (low half of each q register is entry x, high half is x+1)
             */
            vshll.u16   q8, d20, #8
            vshll.u16   q9, d22, #8
            vmlsl.u16   q8, d20, \xr0
            vmlsl.u16   q9, d22, \xr1
            vmlal.u16   q8, d21, \xr0
            vmlal.u16   q9, d23, \xr1
            vshrn.u32   d28, q8, #8
            vshrn.u32   d29, q9, #8

            /* pack lanes 0-1 -> dst (saturating, rounding narrow to 8 bits) */
            vqrshrn.u16  \dst, q14, #8
.endm

/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // r0
 *          void const *in,     // r1
 *          size_t count,       // r2
 *          void const *lut,    // r3
 *          int32_t pitchy,     // [sp]
 *          int32_t pitchz,     // [sp+#4]
 *          int dimx,           // [sp+#8]
 *          int dimy,           // [sp+#12]
 *          int dimz);          // [sp+#16]
 *
 * Maps count 4-byte pixels from in to dst through a 3D lookup table with
 * trilinear interpolation.  The first three bytes of each pixel, scaled by
 * dimx, dimy and dimz, select the x, y and z table coordinates; the fourth
 * byte is copied to the output unchanged.  Table entries are 4 bytes wide
 * along x; pitchy and pitchz are the byte strides along y and z.
 *
 * Pixels are processed eight at a time.  A final group of 1-7 pixels is
 * loaded and stored piecewise so that no memory beyond in[count] or
 * dst[count] is touched.  count == 0 returns immediately.
 *
 * Saves and restores r4-r7 and d8-d15; r12 is used as scratch.
 * r0 is set to 0 on return.
 */
ENTRY(rsdIntrinsic3DLUT_K)
            /* Four words pushed, so the stack arguments now start at [sp, #16]. */
            push        {r4,r5,r6,r7}
            ldr         r4, [sp, #16]           /* pitchy */
            ldr         r5, [sp, #20]           /* pitchz */
            ldr         r6, [sp, #24]           /* dimx */
            ldr         r7, [sp, #28]           /* dimy */
            ldr         r12, [sp, #32]          /* dimz */
            vpush       {d8-d15}

            /* q4: d8 = {dimx, dimy, dimz, filler} as u16, d9 = {pitchy, pitchz} */
            vmov.u8     d8, #1
            vmov.u16    d8[0], r6
            vmov.u16    d8[1], r7
            vmov.u16    d8[2], r12
            vmov        d9, r4, r5

            /* r2 is kept biased by -8 from here on:
             *   r2 >= 0        at least one full group of eight remains
             *   -8 < r2 < 0    1-7 pixels remain (low three bits = how many)
             *   r2 <= -8       nothing to do
             */
            subs        r2, #8
            bge         2f
            cmp         r2, #-8
            ble         9f
            b           4f

            .align 6
            /* Loop back-edge: store the previous group of eight, load the next. */
1:          vst4.u8     {d12,d13,d14,d15}, [r0]!
            /* r0  = dst
             * r1  = src
             * r2  = count
             * r3  = lut
             * r4  = pitchy
             * r5  = pitchz
             * r6  = offset0 (scratch inside lanepair)
             * r7  = offset1 (scratch inside lanepair)
             */
            /* De-interleaving load: d0, d2, d4, d6 = byte 0, 1, 2, 3 of eight pixels. */
2:          vld4.u8     {d0,d2,d4,d6}, [r1]!
            /* The tail path joins here with d0,d2,d4,d6 in the same layout.
             * Keep the fourth byte of each pixel for pass-through to the output.
             */
3:          vmov        d10, d6
            /* q0,q1,q2,d10 source data
             * q4 dimensions and pitches
             * q3, scratch register for scalar access
             */
            vmov        q3, q4
            vmovl.u8    q0, d0
            vmovl.u8    q1, d2
            vmovl.u8    q2, d4
            vmul.u16    q0, q0, d6[0]
            vmul.u16    q1, q1, d6[1]
            vmul.u16    q2, q2, d6[2]

            /* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
             * where we try to read from the limit of the array and the limit +1 to
             * interpolate, even though the fractional component is zero.  Strictly this is
             * correct, except for the illegal access problem.
             */
            /* v += v >> 8 */
            vsra.u16    q0, q0, #8
            vsra.u16    q1, q1, #8
            vsra.u16    q2, q2, #8

            /* high byte = integer table coordinate */
            vshr.u16    q12, q0, #8
            vshr.u16    q13, q1, #8
            vshr.u16    q14, q2, #8

            /* low byte = blend weight; y is narrowed to 8 bits because
             * lanepair consumes it with vdup.u8
             */
            vbic.u16    q0, #0xff00
            vmovn.u16   d2, q1
            vbic.u16    q2, #0xff00

            /* q0,d2,q2 fractional offset
             * q12,q13,q14 integer offset
             */

            /* byte offset = x * 4 + y * pitchy + z * pitchz, as 32-bit lanes */
            vshll.u16   q6, d24, #2
            vshll.u16   q7, d25, #2
            vmovl.u16   q8, d26
            vmovl.u16   q9, d27
            vmovl.u16   q10, d28
            vmovl.u16   q11, d29
            vmla.s32    q6, q8,  d9[0]
            vmla.s32    q7, q9,  d9[0]
            vmla.s32    q6, q10, d9[1]
            vmla.s32    q7, q11, d9[1]

            /* q6,q7 list of table offsets */

            /* lanes 0 and 1 */
            lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]

            /* lanes 2 and 3 */
            lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]

            /* lanes 4 and 5 */
            lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]

            /* lanes 6 and 7 */
            lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]

            /* d12-d15 each hold two interleaved pixels; de-interleave so that
             * d12,d13,d14,d15 = byte 0,1,2,3 of all eight pixels (vst4 layout)
             */
            vuzp.u8     d12, d13
            vuzp.u8     d14, d15
            vuzp.u8     d12, d14
            vuzp.u8     d13, d15

            subs        r2, r2, #8
            /* replace byte 3 with the value saved from the input */
            vmov.u8     d15, d10

            /* another full group of eight is available */
            bge         1b

            /* r2 < -8 only when this pass came from the tail path at 4: */
            cmp         r2, #-8
            blt         1f

            vst4.u8     {d12,d13,d14,d15}, [r0]!

            /* flags still come from the cmp above: r2 == -8 means no tail */
            beq         9f

            /* Tail load, 1-7 pixels. */
            /* fill the vector with a safe value */
4:          vld1.u32    {d0[]}, [r1]
            vmov        d2, d0
            vmov        d4, d0
            vmov        d6, d0
            /* load 4, 2 and 1 pixels according to the low bits of the count */
            tst         r2, #4
            beq         2f
            vld1.u32    {d0}, [r1]!
            vld1.u32    {d2}, [r1]!
2:          tst         r2, #2
            beq         2f
            vld1.u32    {d4}, [r1]!
2:          tst         r2, #1
            beq         2f
            vld1.u32    {d6[0]}, [r1]!
            /* de-interleave into the layout the vld4 at 2: produces */
2:          vuzp.8      d0, d2
            vuzp.8      d4, d6
            vuzp.8      d0, d4
            vuzp.8      d2, d6
            b           3b

            /* Tail store: re-interleave to two pixels per D register, then
             * write 4, 2 and 1 pixels mirroring the tail load.
             */
1:          vzip.8      d12, d14
            vzip.8      d13, d15
            vzip.8      d12, d13
            vzip.8      d14, d15
            tst         r2, #4
            beq         2f
            vst1.u32    {d12,d13}, [r0]!
2:          tst         r2, #2
            beq         2f
            vst1.u32    {d14}, [r0]!
2:          tst         r2, #1
            beq         9f
            vst1.u32    {d15[0]}, [r0]!

9:          mov         r0, #0
            vpop        {d8-d15}
            pop         {r4,r5,r6,r7}
            bx lr
END(rsdIntrinsic3DLUT_K)