/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v10 and v11.  Working constants are pre-loaded into v24-v31, and v3 and v7
 * are pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
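/* For reference, the equations being refactored appear to be the familiar
 * integer YUV-to-RGB form, sketched here in illustrative C (clamp8 saturates
 * to 0..255; none of these names appear elsewhere in this file):
 *
 *      r = clamp8((298 * (y - 16)                   + 409 * (v - 128) + 128) >> 8);
 *      g = clamp8((298 * (y - 16) - 100 * (u - 128) - 208 * (v - 128) + 128) >> 8);
 *      b = clamp8((298 * (y - 16) + 516 * (u - 128)                   + 128) >> 8);
 *
 * The multipliers 149, 50, 104, 204 and 254 used below are (close to) half of
 * 298, 100, 208, 409 and 516, which keeps every 8x8-bit product within 16
 * bits; the missing halves of 409 and 516 are restored by the separate
 * (v >> 1) and (u << 2) terms, and the bias constants in v29-v31 fold in the
 * -16 and -128 offsets before the final rounding shifts.
 */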
.macro yuvkern, regu=v10, regv=v11
        /* v0    out R_lo / even R_lo accumulator
         * v1    out G_lo / even G_lo accumulator
         * v2    out B_lo / even B_lo accumulator
         * v3    out A_lo / const 0xff*ff
         * v4    out R_hi / even R_hi accumulator
         * v5    out G_hi / even G_hi accumulator
         * v6    out B_hi / even B_hi accumulator
         * v7    out A_hi / const 0xff*ff
         * v8    even Y   / G_lo luma tmp
         * v9    odd Y    / G_hi luma tmp
         * \regu in U
         * \regv in V
         * v12   R_lo luma tmp
         * v13   B_lo luma tmp
         * v14   R_hi luma tmp
         * v15   B_hi luma tmp
         * v16   odd R_lo accumulator
         * v17   odd G_lo accumulator
         * v18   odd B_lo accumulator
         * v19   multiplier extra bits low
         * v20   odd R_hi accumulator
         * v21   odd G_hi accumulator
         * v22   odd B_hi accumulator
         * v23   multiplier extra bits high
         * v24   constant 149
         * v25   constant 50
         * v26   constant 104
         * v27   constant 204
         * v28   constant 254
         * v29   constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
         * v30   constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
         * v31   constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
         */

        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149

        umull       v8.8h,  \regu\().8b,  v25.8b    // g2 = u * 50 + v * 104
        umlal       v8.8h,  \regv\().8b,  v26.8b
        umull2      v9.8h,  \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
        umlal2      v9.8h,  \regv\().16b, v26.16b

        ushr        v19.16b, \regv\().16b, #1
        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)

        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)

        ushll       v19.8h, \regu\().8b, #2
        ushll2      v23.8h, \regu\().16b, #2
        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)

        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)

        umull       v12.8h, \regv\().8b, v27.8b // r2 = v * 204
        umull       v13.8h, \regu\().8b, v28.8b // b2 = u * 254

        umull2      v14.8h, \regv\().16b, v27.16b   // r2_hi = v_hi * 204
        umull2      v15.8h, \regu\().16b, v28.16b   // b2_hi = u_hi * 254

        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1

        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1

        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b,  v0.8h,  #6
        uqrshrn     v16.8b, v16.8h, #6
        uqrshrn     v1.8b,  v1.8h,  #7
        uqrshrn     v17.8b, v17.8h, #7
        uqrshrn     v2.8b,  v2.8h,  #6
        uqrshrn     v18.8b, v18.8h, #6

        uqrshrn     v4.8b,  v4.8h,  #6
        uqrshrn     v20.8b, v20.8h, #6
        uqrshrn     v5.8b,  v5.8h,  #7
        uqrshrn     v21.8b, v21.8h, #7
        uqrshrn     v6.8b,  v6.8h,  #6
        uqrshrn     v22.8b, v22.8h, #6

        zip1        v0.16b, v0.16b, v16.16b
        zip1        v1.16b, v1.16b, v17.16b
        zip1        v2.16b, v2.16b, v18.16b

        zip1        v4.16b, v4.16b, v20.16b
        zip1        v5.16b, v5.16b, v21.16b
        zip1        v6.16b, v6.16b, v22.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
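/* In outline, a wrapper instantiated from this macro behaves like the
 * following sketch (C-style pseudocode only; x2 holds the number of pixels
 * still to be converted and the pointer registers are documented with each
 * entry point below):
 *
 *      load_constants();
 *      while (x2 >= 32) {
 *          load 32 bytes of Y and the matching chroma;
 *          yuvkern();                          // convert 32 pixels
 *          store 128 bytes of RGBA;
 *          x2 -= 32;
 *      }
 *      if (x2 > 0)
 *          handle the 1..31 remaining pixels in power-of-two chunks;
 */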
.macro wrap_line kernel, interleaved=0, swapuv=0
        movi        v24.16b, #149
        movi        v25.16b, #50
        movi        v26.16b, #104
        movi        v27.16b, #204
        movi        v28.16b, #254
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v29.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v30.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v31.8h, w5

        movi        v3.16b, #0xff
        movi        v7.16b, #0xff

        subs        x2, x2, #32
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.16b,v9.16b}, [x1], #32
        .if \interleaved
        ld2         {v10.16b,v11.16b}, [x3], #32
        .else
        ld1         {v10.16b}, [x3], #16
        ld1         {v11.16b}, [x4], #16
        .endif

        .if \swapuv
        \kernel regu=v11, regv=v10
        .else
        \kernel
        .endif

        subs        x2, x2, #32

        st4         {v0.16b - v3.16b}, [x0], #64
        st4         {v4.16b - v7.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #32
        beq         2f

        /* To handle the tail portion of the data (something less than 32
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
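        /* For example, a 21-pixel remainder (binary 10101) is gathered as one
         * 16-byte, one 4-byte and one 1-byte load of Y (with correspondingly
         * sized chroma loads), selected by the individual bit tests on x2
         * below.
         */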
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v10.8b, #0
        movi        v11.8b, #0

        tbz         x2, #4, 1f
        ld1         {v9.16b}, [x1], #16
        .if \interleaved
        ld1         {v11.16b}, [x3], #16
        .else
        ld1         {v10.d}[1], [x3], #8
        ld1         {v11.d}[1], [x4], #8
        .endif
1:      tbz         x2, #3, 1f
        ld1         {v8.d}[1], [x1], #8
        .if \interleaved
        ld1         {v10.d}[1], [x3], #8
        .else
        ld1         {v10.s}[1], [x3], #4
        ld1         {v11.s}[1], [x4], #4
        .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
        .if \interleaved
        ld1         {v10.s}[1], [x3], #4
        .else
        ld1         {v10.h}[1], [x3], #2
        ld1         {v11.h}[1], [x4], #2
        .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
        .if \interleaved
        ld1         {v10.h}[1], [x3], #2
        .else
        ld1         {v10.b}[1], [x3], #1
        ld1         {v11.b}[1], [x4], #1
        .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
        .if \interleaved
        ld1         {v10.h}[0], [x3], #2
        .else
        ld1         {v10.b}[0], [x3], #1
        ld1         {v11.b}[0], [x4], #1
        .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
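        /* For example, if the loads above left Y bytes Y0 Y1 Y2 Y3 ... packed
         * linearly across v8/v9, the uzp1/uzp2 pair below separates them into
         * even bytes (Y0 Y2 ...) in v8 and odd bytes (Y1 Y3 ...) in v9, the
         * same layout that ld2 produces in the main loop.
         */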
1:      mov         v12.16b, v8.16b
        uzp1        v8.16b, v12.16b, v9.16b
        uzp2        v9.16b, v12.16b, v9.16b
        .if \interleaved
        mov         v12.16b, v10.16b
        uzp1        v10.16b, v12.16b, v11.16b
        uzp2        v11.16b, v12.16b, v11.16b
        .endif

        .if \swapuv
        \kernel regu=v11, regv=v10
        .else
        \kernel
        .endif

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
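        /* For example, zip1/zip2 below first interleave R with B and G with A,
         * then interleave those two results, leaving v0-v3 holding the bytes
         * R0 G0 B0 A0 R1 G1 B1 A1 ... in memory order so they can be written
         * with plain st1 stores.
         */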
        zip1        v16.16b, v0.16b, v2.16b
        zip2        v18.16b, v0.16b, v2.16b
        zip1        v17.16b, v1.16b, v3.16b
        zip2        v19.16b, v1.16b, v3.16b
        zip1        v0.16b, v16.16b, v17.16b
        zip2        v1.16b, v16.16b, v17.16b
        zip1        v2.16b, v18.16b, v19.16b
        zip2        v3.16b, v18.16b, v19.16b

        /* Luckily v4-v7 don't need to be unzipped because the complete set of
         * four can be stored using st4. */

        tbz         x2, #4, 1f
        st4         {v4.16b - v7.16b}, [x0], #64
1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/* void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
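/* A sketch of the argument fix-up performed on entry (a reading of the code
 * below, assuming 4-byte RGBA output pixels and half-width U and V planes):
 *
 *      out += xstart * 4;
 *      yin += xstart;
 *      uin += xstart / 2;
 *      vin += xstart / 2;
 *      x2   = xend - (xstart & ~1);    // pixels left to convert
 */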
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1
        add         x0, x0, x4, LSL #2
        add         x1, x1, x4
        add         x4, x3, x6
        add         x3, x2, x6
        sub         x2, x5, x6, LSL #1

        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/* void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
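/* This entry point and rsdIntrinsicYuvR_K below differ only in the swapuv
 * argument passed to wrap_line: here the first byte of each interleaved
 * chroma pair is used as V and the second as U, while rsdIntrinsicYuvR_K
 * uses them the other way around.
 */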
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/* void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)