You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

636 lines
17 KiB

///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//* ihevc_deblk_luma_vert.s
//*
//* //brief
//* contains function definitions for inter prediction interpolation.
//* functions are coded using neon intrinsics and can be compiled using
//* rvct
//*
//* //author
//* anand s
//*
//* //par list of functions:
//*
//*
//* //remarks
//* none
//*
//*******************************************************************************/
.text
.align 4
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_vert_av8
.type ihevc_deblk_luma_vert_av8, %function
ihevc_deblk_luma_vert_av8:
sxtw x5,w5
sxtw x6,w6
stp d8,d9,[sp,#-16]!
stp d10,d11,[sp,#-16]!
stp d12,d13,[sp,#-16]!
stp d14,d15,[sp,#-16]!
stp x19, x20,[sp,#-16]!
stp x21, x22,[sp,#-16]!
mov x21,x7
ldr w22,[sp,#96]
add x3,x3,x4
add x3,x3,#1
asr x3,x3,#1
add x7,x3,x5,lsl #1
add x3,x3,x6,lsl #1
cmp x7,#0x33
mov x20,#0x33
csel x7, x20, x7,gt
bgt l1.56
cmp x7,#0x0
mov x20,#0x0
csel x7, x20, x7,lt // x7 has the beta_index value
l1.56:
// bic x2,x2,#1
asr x2,x2,#1
add x3,x3,x2,lsl #1
cmp x3,#0x35
mov x20,#0x35
csel x3, x20, x3,gt
bgt l1.88
cmp x3,#0x0
mov x20,#0x0
csel x3, x20, x3,lt // x3 has the tc_index value
// qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
// beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
// tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//
l1.88:
adrp x2, :got:gai4_ihevc_beta_table
ldr x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
movi v18.8b, #0x2
adrp x4, :got:gai4_ihevc_tc_table
ldr x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
ldr w5,[x2,x7,lsl #2] // beta
movi v16.8h, #0x2
ldr w6,[x4,x3,lsl #2] // tc
lsl x8,x6,#1
cmp x6,#0
dup v19.8b,w8
sub x7,x0,#4
movi v23.8b, #0x3
beq l1.964
sub x19,x0,#3
ld1 {v15.8b},[x7],x1
ldrb w8,[x19] // -3 value
ld1 {v1.8b},[x7],x1
ldrb w10,[x19,#1] //-2 value
ld1 {v29.8b},[x7],x1
ldrb w11,[x19,#2] //-1 value
ld1 {v0.8b},[x7]
ldrb w12,[x0,#0] // 0 value
ldrb w9,[x0,#1] // 1 value
trn1 v24.8b,v15.8b,v1.8b
trn2 v1.8b,v15.8b,v1.8b
ldrb w2,[x0,#2] // 2 value
trn1 v2.8b,v29.8b,v0.8b
trn2 v0.8b,v29.8b,v0.8b
add x12,x12,x2
subs x9,x12,x9,lsl #1 // dq0 value is stored in x9
csneg x9,x9,x9,pl
//dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
mov v29.8b,v24.8b
trn1 v24.4h,v29.4h,v2.4h
trn2 v2.4h,v29.4h,v2.4h
add x8,x8,x11
mov v15.8b,v1.8b
trn1 v1.4h,v15.4h,v0.4h
trn2 v0.4h,v15.4h,v0.4h
subs x8,x8,x10,lsl #1
csneg x8,x8,x8,pl
// dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//
add x14,x1,x1,lsl #1
add x14,x0,x14
sub x19,x14,#3
dup v4.2s, v24.s[1]
ldrb w2,[x19] // -2 value
dup v7.2s, v2.s[1]
ldrb w10,[x19,#1] // -2 value
dup v3.2s, v2.s[0]
ldrb w11,[x19,#2] // -1 value
dup v5.2s, v1.s[1]
ldrb w12,[x14,#0] // 0 value
dup v6.2s, v1.s[0]
ldrb w3,[x14,#1] // 1 value
dup v2.2s, v0.s[0]
ldrb w4,[x14,#2] // 2 value
add x12,x12,x4
subs x12,x12,x3,lsl #1 // dq3value is stored in x12
csneg x12,x12,x12,pl
// dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//
add x2,x2,x11
subs x11,x2,x10,lsl #1
csneg x11,x11,x11,pl // dp3 value is stored in x8
// dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )//
add x3,x8,x9 // x3 has the d0 value
add x4,x11,x12 // x4 has the d3 value
// d0 = dp0 + dq0//
// d3 = dp3 + dq3//
add x14,x8,x11 // x13 has the value dp
add x12,x12,x9 // x12 has the value dq
// dp = dp0 + dp3//
// dq = dq0 + dq3//
add x11, x3, x4 // x3 has the value d
// d = d0 + d3//
cmp x11,x5
dup v22.2s, v0.s[1]
bge l1.964
// if(d < beta)
// registers which cannont be altered : x3,x4 x5,x6,x12,x13,x0,x1,x11
// registers for use: x2,x7,x8,x9,x10,
uqsub v30.8b,v7.8b,v19.8b
asr x10,x5,#2
uqadd v31.8b,v7.8b,v19.8b
cmp x10,x3,lsl #1
uaddl v0.8h,v5.8b,v4.8b
ble l1.336
sub x19,x0,4
ldrb w2,[x19]
uaddw v0.8h, v0.8h , v2.8b
ldrb w7,[x19,#3]
umull v20.8h, v7.8b, v23.8b
ldrb w3,[x0,#0]
umlal v20.8h, v22.8b, v18.8b
ldrb w8,[x0,#3]
// ubfx x7,x2,#24,#8 // has the -1 value
// and x2,#0xff // has the -4 value
// ubfx x8,x3,#24,#8 // has the 3 value
// and x3,#0xff // x4 has the 0 value
add v20.8h, v20.8h , v0.8h
subs x8,x8,x3
rshrn v22.8b,v20.8h,#3
csneg x8,x8,x8,pl
subs x2,x2,x7
umin v21.8b, v22.8b , v31.8b
csneg x2,x2,x2,pl
umax v22.8b, v21.8b , v30.8b
add x8,x8,x2
uaddl v20.8h,v7.8b,v3.8b
cmp x8,x5,asr #3
mla v20.8h, v0.8h, v16.8h
bge l1.336
uaddw v0.8h, v0.8h , v7.8b
subs x7,x3,x7
rshrn v20.8b,v20.8h,#3
csneg x7,x7,x7,pl
rshrn v0.8b,v0.8h,#2
mov x10,#5
uqadd v30.8b,v5.8b,v19.8b
mul x10, x10, x6
uqsub v31.8b,v5.8b,v19.8b
add x10, x10,#1
cmp x7,x10,asr #1
bge l1.336
// if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
// && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
asr x10,x5,#2
uqsub v25.8b,v4.8b,v19.8b
cmp x10,x4,lsl #1
uqadd v21.8b,v4.8b,v19.8b
ble l1.336
umin v26.8b, v20.8b , v21.8b
add x4,x1,x1,lsl #1
add x4,x4,x0
umax v20.8b, v26.8b , v25.8b
sub x19,x4,#4
ldrb w2,[x19]
umin v19.8b, v0.8b , v30.8b
ldrb w7,[x19,#3]
umax v21.8b, v19.8b , v31.8b
ldrb w3,[x4,#0]
lsl x10,x6,#1
ldrb w8,[x4,#3]
// ubfx x7,x2,#24,#8 // has the -1 value
// and x2,#0xff // has the -4 value
// ubfx x8,x3,#24,#8 // has the 3 value
// and x3,#0xff // x4 has the 0 value
uaddl v0.8h,v2.8b,v3.8b
dup v19.8b,w10
subs x8,x8,x3
uaddw v0.8h, v0.8h , v4.8b
csneg x8,x8,x8,pl
uqadd v30.8b,v2.8b,v19.8b
subs x2,x2,x7
uqsub v31.8b,v2.8b,v19.8b
csneg x2,x2,x2,pl
uaddl v26.8h,v5.8b,v6.8b
add x8,x8,x2
mla v26.8h, v0.8h, v16.8h
cmp x8,x5,asr #3
bge l1.336
rshrn v26.8b,v26.8h,#3
subs x7,x3,x7
uqadd v27.8b,v3.8b,v19.8b
csneg x7,x7,x7,pl
uqsub v28.8b,v3.8b,v19.8b
mov x10,#5
umin v16.8b, v26.8b , v30.8b
mul x10, x10, x6
add x10, x10,#1
cmp x7,x10,asr #1
umax v26.8b, v16.8b , v31.8b
bge l1.336
uqadd v30.8b,v6.8b,v19.8b
mov x2,#2
mov x4,x21
uqsub v31.8b,v6.8b,v19.8b
mov x5,x22
b end_dep_deq_decision
// x2 has the value of de
// x6 has teh value of tc
// x5 has the value of beta
// x14 has the value of dp
// x12 has the value of dq
// x0 has the value of source address
// x1 has the src stride
l1.336:
mov x2,#1
l1.424:
mov x11,x5
mov x4,x21
mov x5,x22
cmp x6,#1
mov x20,#0
csel x9, x20, x9,eq
mov x20,#0
csel x10, x20, x10,eq
beq end_dep_deq_decision
and x7,x4,x5
cmp x7,#1
beq both_flags_set
cmp x4,#0
beq set_flag_dep_zero
add x8,x11,x11,asr #1
mov x10,#0
asr x8,x8,#3
cmp x8,x14
mov x20,#1
csel x9, x20, x9,gt
mov x20,#0
csel x9, x20, x9,le
b end_dep_deq_decision
set_flag_dep_zero:
add x8,x11,x11,asr #1
mov x9,#0
asr x8,x8,#3
cmp x8,x12
mov x20,#1
csel x10, x20, x10,gt
mov x20,#0
csel x10, x20, x10,le
b end_dep_deq_decision
both_flags_set:
add x8,x11,x11,asr #1
asr x8,x8,#3
cmp x8,x14
mov x20,#1
csel x9, x20, x9,gt
mov x20,#0
csel x9, x20, x9,le
cmp x8,x12
mov x20,#1
csel x10, x20, x10,gt
mov x20,#0
csel x10, x20, x10,le
end_dep_deq_decision:
//x0=source address
//x1=stride
// x2 =de
// x4=flag p
//x5= flag q
//x6 =tc
// x9 =dep
// x10=deq
// b l1.964
cmp x2,#2
// x4 has the value of de
bne l1.968
cmp x5,#0
beq l1.780
// x5 has the flag of q
add x3,x0,#2
st1 {v22.b}[0],[x3],x1
st1 {v22.b}[1],[x3],x1
st1 {v22.b}[2],[x3],x1
st1 {v22.b}[3],[x3]
add x3,x0,x1
mov v29.8b,v20.8b
trn1 v20.8b,v29.8b,v21.8b
trn2 v21.8b,v29.8b,v21.8b
st1 {v20.h}[0],[x0]
st1 {v21.h}[0],[x3],x1
st1 {v20.h}[1],[x3],x1
st1 {v21.h}[1],[x3]
l1.780:
cmp x4,#0
beq l1.964
// x4 has the flag p
dup v7.2s, v24.s[0]
sub x3,x0,#1
uaddw v16.8h, v0.8h , v6.8b
add x7,x3,x1
rshrn v2.8b,v16.8h,#2
st1 {v26.b}[0],[x3]
sub x0,x0,#3
umin v16.8b, v2.8b , v27.8b
st1 {v26.b}[1],[x7],x1
umull v2.8h, v6.8b, v23.8b
umlal v2.8h, v7.8b, v18.8b
st1 {v26.b}[2],[x7],x1
umax v5.8b, v16.8b , v28.8b
st1 {v26.b}[3],[x7]
add v0.8h, v2.8h , v0.8h
rshrn v0.8b,v0.8h,#3
umin v1.8b, v0.8b , v30.8b
umax v0.8b, v1.8b , v31.8b
mov v29.8b,v0.8b
trn1 v0.8b,v29.8b,v5.8b
trn2 v5.8b,v29.8b,v5.8b
st1 {v0.h}[0],[x0],x1
st1 {v5.h}[0],[x0],x1
st1 {v0.h}[1],[x0],x1
st1 {v5.h}[1],[x0]
l1.964:
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ret
l1.968:
movi v0.8h, #0x9
neg x11, x6
cmp x4,#0
// checks for the flag p
movi v16.8h, #0x3
movi v24.8b, #0x1
dup v30.8b,w11
and x11,x6,#0xff
dup v31.8b,w11
usubl v18.8h,v4.8b,v2.8b
mul v18.8h, v18.8h, v0.8h
usubl v0.8h,v5.8b,v3.8b
mul v16.8h, v0.8h, v16.8h
sub v16.8h, v18.8h , v16.8h
srshr v16.8h,v16.8h,#4
// delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//
abs v0.8h, v16.8h
xtn v0.8b, v0.8h
// storing the absolute values of delta in d0
sqxtn v16.8b,v16.8h
// storing the clipped values of delta in d16
movi v1.8b, #0xa
dup v21.8b,w11
mul v1.8b, v1.8b, v21.8b
// d1 stores the value (10 * tc)
//if(abs(delta) < 10 * tc)
smin v18.8b, v16.8b , v31.8b
smax v20.8b, v18.8b , v30.8b
// delta = clip3(delta, -tc, tc)//
sxtl v16.8h, v20.8b
uxtl v18.8h, v2.8b
add v18.8h, v18.8h , v16.8h
sqxtun v22.8b, v18.8h
uxtl v18.8h, v4.8b
sub v16.8h, v18.8h , v16.8h
sqxtun v23.8b, v16.8h
// tmp_p0 = clip_u8(pu1_src[-1] + delta)//
// tmp_q0 = clip_u8(pu1_src[0] - delta)//
beq l1.1272
cmp x9,#1
bne l1.1212
// checks for the flag dep
asr x3,x6,#1
uaddl v16.8h,v6.8b,v2.8b
uaddw v16.8h, v16.8h , v24.8b
dup v18.8b,w3
sub x20,x3,#0
neg x3, x20
dup v19.8b,w3
ushr v16.8h,v16.8h,#1
xtn v16.8b, v16.8h
usubl v16.8h,v16.8b,v3.8b
saddw v16.8h, v16.8h , v20.8b
sshr v16.8h,v16.8h,#1
sqxtn v16.8b,v16.8h
smin v17.8b, v16.8b , v18.8b
smax v16.8b, v19.8b , v17.8b
uxtl v18.8h, v3.8b
sxtl v16.8h, v16.8b
add v16.8h, v18.8h , v16.8h
sqxtun v16.8b, v16.8h
mov v30.8b,v3.8b
cmhs v3.8b,v0.8b,v1.8b
bsl v3.8b,v30.8b,v16.8b
l1.1212:
dup v16.8b,w11
sub x12,x0,#3
sub x3,x0,#1
// smul v16.8b, v16.8b, v1.8b
mov v29.8b,v6.8b
trn1 v6.8b,v29.8b,v3.8b
trn2 v3.8b,v29.8b,v3.8b
st1 {v6.h}[0],[x12],x1
cmhs v16.8b,v0.8b,v1.8b
st1 {v3.h}[0],[x12],x1
bsl v16.8b,v2.8b,v22.8b
st1 {v16.b}[0],[x3],x1
st1 {v16.b}[1],[x3],x1
st1 {v6.h}[1],[x12],x1
st1 {v16.b}[2],[x3],x1
st1 {v3.h}[1],[x12]
st1 {v16.b}[3],[x3]
l1.1272:
cmp x5,#0
beq l1.964
// checks for the flag q
cmp x10,#1
bne l1.1412
// checks for the flag deq
mov v2.8b,v7.8b
asr x3,x6,#1
dup v6.8b,w3
sub x20,x3,#0
neg x3, x20
dup v16.8b,w3
uaddl v2.8h,v2.8b,v4.8b
uaddw v2.8h, v2.8h , v24.8b
ushr v2.8h,v2.8h,#1
xtn v2.8b, v2.8h
usubl v2.8h,v2.8b,v5.8b
ssubw v2.8h, v2.8h , v20.8b
sshr v2.8h,v2.8h,#1
sqxtn v3.8b,v2.8h
smin v2.8b, v3.8b , v6.8b
smax v3.8b, v16.8b , v2.8b
// dup v6.8b,w2
// smul v6.8b, v6.8b, v1.8b
uxtl v16.8h, v5.8b
sxtl v2.8h, v3.8b
add v2.8h, v16.8h , v2.8h
sqxtun v3.8b, v2.8h
mov v30.8b,v5.8b
cmhs v5.8b,v0.8b,v1.8b
bsl v5.8b,v30.8b,v3.8b
l1.1412:
// dup v2.8b,w2
add x3,x0,#2
add x11,x3,x1
// smul v1.8b, v2.8b, v1.8b
st1 {v7.b}[0],[x3]
st1 {v7.b}[1],[x11],x1
st1 {v7.b}[2],[x11],x1
cmhs v0.8b,v0.8b,v1.8b
st1 {v7.b}[3],[x11]
bsl v0.8b,v4.8b,v23.8b
mov v29.8b,v0.8b
trn1 v0.8b,v29.8b,v5.8b
trn2 v5.8b,v29.8b,v5.8b
st1 {v0.h}[0],[x0],x1
st1 {v5.h}[0],[x0],x1
st1 {v0.h}[1],[x0],x1
st1 {v5.h}[1],[x0]
ldp x21, x22,[sp],#16
ldp x19, x20,[sp],#16
ldp d14,d15,[sp],#16
ldp d12,d13,[sp],#16
ldp d10,d11,[sp],#16
ldp d8,d9,[sp],#16
ret