/*
 * Copyright (C) 2012,2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
        x0 = dst
        x1 = y0 base pointer
        x2 = y1 base pointer
        x3 = y2 base pointer
        x4 = coeffs
        x5 = length / 2 (loop iteration count; each iteration stores 8 bytes to dst)
*/

/*
 * ENTRY(f): open a global function symbol `f`.
 * Switches to .text, applies `.align 2` (on ARM/AArch64 GAS the operand is a
 * power of two, i.e. 4-byte alignment), exports the symbol, marks it as a
 * function for the ELF symbol table and emits the label itself.
 * The statements are chained with ';', which is a statement separator (not a
 * comment character) in GNU AArch64 assembly.
 */
#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f:

/*
 * END(f): close the function opened by ENTRY(f) by recording its size as the
 * distance from the label `f` to the current location.
 */
#define END(f) .size f, .-f;

ENTRY(rsdIntrinsicConvolve3x3_K)
/*
 * 3x3 convolution over three rows of unsigned 8-bit samples (AArch64, AAPCS64).
 *
 * In:    x0 = dst; 8 bytes are stored per loop iteration
 *        x1, x2, x3 = source row pointers (y0, y1, y2); every iteration reads
 *                     16 bytes from each row and advances the pointer by 8
 *        x4 = coefficients; 16 halfwords (32 bytes) are loaded and the first
 *             nine are used as signed 16-bit multipliers, row by row
 *        x5 = loop iteration count. Must be non-zero: the counter is
 *             decremented before it is tested, so 0 would wrap around.
 * Out:   nothing in registers; x0-x3 are left advanced by 8 bytes/iteration.
 * Clobb: x4 (reused as the row stride), x5, x6, v0-v7, flags.
 *        v8-v15 (low 64 bits, callee-saved) are spilled to 64 bytes of stack
 *        on entry and reloaded before returning.
 */
        /* Reserve 64 bytes and spill d8-d15; x6 walks the save area so that
           sp itself stays at the bottom of the frame. */
        sub             x6, sp, #64
        sub             sp, sp, #64
        st1             {v8.1d-v11.1d}, [x6], #32
        st1             {v12.1d-v15.1d}, [x6]

        /* Coefficients: v0.h[0..7] and v1.h[0] hold the nine taps. */
        ld1             {v0.8h, v1.8h}, [x4]

        /* x4 is free now; keep the 8-byte row stride in it, because a
           16-byte ld1 cannot post-increment by an immediate of 8. */
        mov             x4, #8

1:
        /* Fetch 16 bytes from each row, then step every row pointer by 8. */
        ld1             {v13.16b}, [x1], x4
        ld1             {v14.16b}, [x2], x4
        ld1             {v15.16b}, [x3], x4

        /* Signal memory for data that will be used in the loop after the next */
//      prfm PLDL1KEEP,[x1, x4] // TODO: test this
//      prfm PLDL1KEEP,[x2, x4] // TODO: test this
//      prfm PLDL1KEEP,[x3, x4] // TODO: test this

        /* Zero-extend the bytes to 16 bit: two registers per row. */
        uxtl            v2.8h, v13.8b
        uxtl2           v3.8h, v13.16b
        uxtl            v4.8h, v14.8b
        uxtl2           v5.8h, v14.16b
        uxtl            v6.8h, v15.8b
        uxtl2           v7.8h, v15.16b

        /*
         * Each row is now four groups of four 16-bit lanes:
         *      row y0: v2lo, v2hi, v3lo, v3hi
         *      row y1: v4lo, v4hi, v5lo, v5hi
         *      row y2: v6lo, v6hi, v7lo, v7hi
         * v8 accumulates the first output group from groups 0..2 of every
         * row, v9 the second output group from groups 1..3, both using the
         * same three coefficients per row.
         */

        /* Row y0, taps v0.h[0..2] */
        smull           v8.4s, v2.4h, v0.h[0]
        smull2          v9.4s, v2.8h, v0.h[0]
        smlal2          v8.4s, v2.8h, v0.h[1]
        smlal           v9.4s, v3.4h, v0.h[1]
        smlal           v8.4s, v3.4h, v0.h[2]
        smlal2          v9.4s, v3.8h, v0.h[2]

        /* Row y1, taps v0.h[3..5] */
        smlal           v8.4s, v4.4h, v0.h[3]
        smlal2          v9.4s, v4.8h, v0.h[3]
        smlal2          v8.4s, v4.8h, v0.h[4]
        smlal           v9.4s, v5.4h, v0.h[4]
        smlal           v8.4s, v5.4h, v0.h[5]
        smlal2          v9.4s, v5.8h, v0.h[5]

        /* Row y2, taps v0.h[6..7] and v1.h[0] */
        smlal           v8.4s, v6.4h, v0.h[6]
        smlal2          v9.4s, v6.8h, v0.h[6]
        smlal2          v8.4s, v6.8h, v0.h[7]
        smlal           v9.4s, v7.4h, v0.h[7]
        smlal           v8.4s, v7.4h, v1.h[0]
        smlal2          v9.4s, v7.8h, v1.h[0]

        /* Drop the low 8 bits (truncating shift) and narrow 32 -> 16 bit;
           both output groups end up side by side in v8. */
        shrn            v8.4h, v8.4s, #8
        shrn2           v8.8h, v9.4s, #8

        /* Saturate signed 16 bit -> unsigned 8 bit and store 8 bytes. */
        sqxtun          v8.8b, v8.8h
        st1             {v8.8b}, [x0], #8

        /* Are we done yet? */
        subs            x5, x5, #1
        bne             1b

        /* We're done: reload d8-d15; the two post-increments release the
           64-byte frame. */
        ld1             {v8.1d-v11.1d}, [sp], #32
        ld1             {v12.1d-v15.1d}, [sp], #32
        ret
END(rsdIntrinsicConvolve3x3_K)

/* Convolve 5x5 */

/*
        x0 = dst
        x1 = y0 base pointer
        x2 = y1 base pointer
        x3 = y2 base pointer
        x4 = y3 base pointer
        x5 = y4 base pointer
        x6 = coeffs
        x7 = length (loop iteration count; each iteration stores 8 bytes to dst)
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
/*
 * 5x5 convolution over five rows of unsigned 8-bit samples (AArch64, AAPCS64).
 *
 * In:    x0 = dst; 8 bytes are stored per loop iteration
 *        x1..x5 = source row pointers (y0..y4, i.e. rows y-2 .. y+2); every
 *                 iteration reads 24 bytes from each row and advances the
 *                 pointer by 8
 *        x6 = coefficients; 28 halfwords (56 bytes) are loaded and the first
 *             25 are used as signed 16-bit multipliers, row by row
 *        x7 = loop iteration count. Must be non-zero: the counter is
 *             decremented before it is tested, so 0 would wrap around.
 * Out:   nothing in registers; x0-x5 are left advanced by 8 bytes/iteration.
 * Clobb: x6 (reused as the row stride), x7, x8, v0-v5, flags.
 *        v8-v15 (low 64 bits, callee-saved) are spilled to 64 bytes of stack
 *        on entry and reloaded before returning.
 */
        /* Reserve 64 bytes and spill d8-d15; x8 walks the save area so that
           sp itself stays at the bottom of the frame. */
        sub             x8, sp, #64
        sub             sp, sp, #64
        st1             {v8.1d-v11.1d}, [x8], #32
        st1             {v12.1d-v15.1d}, [x8]

        /* Coefficients: v0.h[0..7], v1.h[0..7], v2.h[0..7] and v3.h[0]
           hold the 25 taps. */
        ld1             {v0.8h-v2.8h}, [x6], #48
        ld1             {v3.4h}, [x6], #8

        /* Constant 0x7f in every 32-bit lane, added to the sums before the
           final narrowing shift.
           NOTE(review): rshrn below already rounds, so the total bias is
           0x7f plus the rounding constant - confirm this is intended. */
        movi            v15.4s, #0x7f

        /* x6 is free now; keep the 8-byte row stride in it, because the
           24-byte ld1 can only post-increment by 24 or by a register. */
        mov             x6, #8

1:
        /* Rows y0 and y1: load 24 bytes each into three D registers and
           step the row pointers by x6 = 8. */
        ld1             {v9.8b-v11.8b}, [x1], x6        // y0 ( y - 2 )
        ld1             {v12.8b-v14.8b}, [x2], x6       // y1 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm PLDL1KEEP,[x1, x6] // TODO: test this
//      prfm PLDL1KEEP,[x2, x6] // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl            v9.8h, v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b
        uxtl            v12.8h, v12.8b
        uxtl            v13.8h, v13.8b
        uxtl            v14.8h, v14.8b

        /*
         * Each row is now six groups of four 16-bit lanes:
         *      row y0: v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
         *      row y1: v12lo, v12hi, v13lo, v13hi, v14lo, v14hi
         * v4 accumulates the first output group from groups 0..4 of every
         * row, v5 the second output group from groups 1..5, both using the
         * same five coefficients per row.
         */

        /* Row y0, taps v0.h[0..4] */
        smull           v4.4s, v9.4h, v0.h[0]
        smull2          v5.4s, v9.8h, v0.h[0]
        smlal2          v4.4s, v9.8h, v0.h[1]
        smlal           v5.4s, v10.4h, v0.h[1]
        smlal           v4.4s, v10.4h, v0.h[2]
        smlal2          v5.4s, v10.8h, v0.h[2]
        smlal2          v4.4s, v10.8h, v0.h[3]
        smlal           v5.4s, v11.4h, v0.h[3]
        smlal           v4.4s, v11.4h, v0.h[4]
        smlal2          v5.4s, v11.8h, v0.h[4]

        /* Row y1, taps v0.h[5..7] and v1.h[0..1] */
        smlal           v4.4s, v12.4h, v0.h[5]
        smlal2          v5.4s, v12.8h, v0.h[5]
        smlal2          v4.4s, v12.8h, v0.h[6]
        smlal           v5.4s, v13.4h, v0.h[6]
        smlal           v4.4s, v13.4h, v0.h[7]
        smlal2          v5.4s, v13.8h, v0.h[7]
        smlal2          v4.4s, v13.8h, v1.h[0]
        smlal           v5.4s, v14.4h, v1.h[0]
        smlal           v4.4s, v14.4h, v1.h[1]
        smlal2          v5.4s, v14.8h, v1.h[1]

        /* Next 2 rows */
        /* Rows y2 and y3: same load pattern, reusing v9-v14. */
        ld1             {v9.8b-v11.8b}, [x3], x6        // y2 ( y )
        ld1             {v12.8b-v14.8b}, [x4], x6       // y3 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm PLDL1KEEP,[x3, x6] // TODO: test this
//      prfm PLDL1KEEP,[x4, x6] // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl            v9.8h, v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b
        uxtl            v12.8h, v12.8b
        uxtl            v13.8h, v13.8b
        uxtl            v14.8h, v14.8b

        /*
         *      row y2: v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
         *      row y3: v12lo, v12hi, v13lo, v13hi, v14lo, v14hi
         */

        /* Row y2, taps v1.h[2..6] */
        smlal           v4.4s, v9.4h, v1.h[2]
        smlal2          v5.4s, v9.8h, v1.h[2]
        smlal2          v4.4s, v9.8h, v1.h[3]
        smlal           v5.4s, v10.4h, v1.h[3]
        smlal           v4.4s, v10.4h, v1.h[4]
        smlal2          v5.4s, v10.8h, v1.h[4]
        smlal2          v4.4s, v10.8h, v1.h[5]
        smlal           v5.4s, v11.4h, v1.h[5]
        smlal           v4.4s, v11.4h, v1.h[6]
        smlal2          v5.4s, v11.8h, v1.h[6]

        /* Row y3, taps v1.h[7] and v2.h[0..3] */
        smlal           v4.4s, v12.4h, v1.h[7]
        smlal2          v5.4s, v12.8h, v1.h[7]
        smlal2          v4.4s, v12.8h, v2.h[0]
        smlal           v5.4s, v13.4h, v2.h[0]
        smlal           v4.4s, v13.4h, v2.h[1]
        smlal2          v5.4s, v13.8h, v2.h[1]
        smlal2          v4.4s, v13.8h, v2.h[2]
        smlal           v5.4s, v14.4h, v2.h[2]
        smlal           v4.4s, v14.4h, v2.h[3]
        smlal2          v5.4s, v14.8h, v2.h[3]

        /* Last row */
        /* Row y4: load 24 bytes into three D registers, step x5 by 8. */
        ld1             {v9.8b-v11.8b}, [x5], x6        // y4 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
//      prfm PLDL1KEEP,[x5, x6] // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl            v9.8h, v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b

        /*
         *      row y4: v9lo, v9hi, v10lo, v10hi, v11lo, v11hi
         */

        /* Row y4, taps v2.h[4..7] and v3.h[0] */
        smlal           v4.4s, v9.4h, v2.h[4]
        smlal2          v5.4s, v9.8h, v2.h[4]
        smlal2          v4.4s, v9.8h, v2.h[5]
        smlal           v5.4s, v10.4h, v2.h[5]
        smlal           v4.4s, v10.4h, v2.h[6]
        smlal2          v5.4s, v10.8h, v2.h[6]
        smlal2          v4.4s, v10.8h, v2.h[7]
        smlal           v5.4s, v11.4h, v2.h[7]
        smlal           v4.4s, v11.4h, v3.h[0]
        smlal2          v5.4s, v11.8h, v3.h[0]

        /* Add the 0x7f constant to every 32-bit sum. */
        add             v4.4s, v4.4s, v15.4s
        add             v5.4s, v5.4s, v15.4s

        /* Rounding shift right by 8 and narrow 32 -> 16 bit; both output
           groups end up side by side in v4. */
        rshrn           v4.4h, v4.4s, #8
        rshrn2          v4.8h, v5.4s, #8

        /* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        sqxtun          v4.8b, v4.8h

        /* Store the 8 result bytes and advance dst. */
        st1             {v4.8b}, [x0], #8

        /* Are we done? */
        subs            x7, x7, #1
        bne             1b

        /* Yup: reload d8-d15; the two post-increments release the 64-byte
           frame. */
        ld1             {v8.1d-v11.1d}, [sp], #32
        ld1             {v12.1d-v15.1d}, [sp], #32
        ret

END(rsdIntrinsicConvolve5x5_K)