You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
524 lines
14 KiB
524 lines
14 KiB
///*****************************************************************************
|
|
//*
|
|
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************/
|
|
///**
|
|
// *******************************************************************************
|
|
// * //file
|
|
// * ihevc_padding_neon.s
|
|
// *
|
|
// * //brief
|
|
// * contains function definitions padding
|
|
// *
|
|
// * //author
|
|
// * naveen sr
|
|
// *
|
|
// * //par list of functions:
|
|
// * - ihevc_pad_left_luma()
// * - ihevc_pad_left_chroma()
// * - ihevc_pad_right_luma()
// * - ihevc_pad_right_chroma()
|
|
// *
|
|
// * //remarks
|
|
// * none
|
|
// *
|
|
// *******************************************************************************
|
|
//*/
|
|
|
|
///**
//*******************************************************************************
//*
//* //brief
//*     padding (luma block) at the left of a 2d array
//*
//* //par description:
//*     for every row the left-most pixel (the byte at pu1_src) is replicated
//*     into the 80 bytes that start at (pu1_src - pad_size). rows are
//*     processed four at a time.
//*
//* //param[in] pu1_src
//*     uword8 pointer to the left-most pixel of the first row
//*
//* //param[in] src_strd
//*     integer source stride (byte distance between consecutive rows)
//*
//* //param[in] ht
//*     integer height of the array (number of rows). must be a multiple of 4,
//*     otherwise it is effectively rounded up to the next multiple of 4.
//*     nothing is read or written when ht <= 0.
//*
//* //param[in] pad_size
//*     integer padding size. only used to locate the start of the padding,
//*     the number of bytes written per row is fixed at 80.
//*
//* //returns
//*     none
//*
//* //remarks
//*     leaf function, no stack usage.
//*     clobbers x0-x11, v0, v2, v4, v6 and the condition flags
//*     (all caller-saved under aapcs64).
//*
//*******************************************************************************
//*/
//void ihevc_pad_left_luma(uword8 *pu1_src,
//                         word32 src_strd,
//                         word32 ht,
//                         word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd  (sign-extended into x1 on entry)
//    w2 => ht
//    w3 => pad_size  (sign-extended into x3 on entry)

.text
.align 4

.globl ihevc_pad_left_luma_av8

.type ihevc_pad_left_luma_av8, %function

ihevc_pad_left_luma_av8:

    // aapcs64 leaves bits [63:32] of a 32-bit argument unspecified, so the
    // int arguments are widened before being used in 64-bit address math
    sxtw        x1,w1                       // x1 = (word64)src_strd
    sxtw        x3,w3                       // x3 = (word64)pad_size

    cmp         w2,#0
    b.le        end_luma_left               // ht <= 0 : nothing to pad

loop_start_luma_left:
    // pad size is assumed to be pad_left = 80
    sub         x4,x0,x3                    // x4 = start of padding, row 0

    ldrb        w8,[x0]                     // left-most pixel of row 0
    add         x0,x0,x1
    ldrb        w9,[x0]                     // left-most pixel of row 1
    add         x0,x0,x1
    ldrb        w10,[x0]                    // left-most pixel of row 2
    add         x0,x0,x1
    ldrb        w11,[x0]                    // left-most pixel of row 3
    add         x0,x0,x1                    // x0 = row 0 of the next group

    dup         v0.16b,w8                   // broadcast each pixel to 16 lanes
    dup         v2.16b,w9
    dup         v4.16b,w10
    dup         v6.16b,w11

    add         x5,x4,x1                    // x5 = start of padding, row 1

    st1         {v0.16b},[x4],#16           // row 0 : 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    add         x6,x5,x1                    // x6 = start of padding, row 2

    st1         {v2.16b},[x5],#16           // row 1 : 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    add         x7,x6,x1                    // x7 = start of padding, row 3

    st1         {v4.16b},[x6],#16           // row 2 : 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    // rows remaining, counted in 32 bits because ht is a word32.
    // st1 does not alter the flags, so they survive until the branch
    subs        w2,w2,#4

    st1         {v6.16b},[x7],#16           // row 3 : 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store

    b.gt        loop_start_luma_left        // signed test, terminates for any ht

end_luma_left:
    ret
|
|
|
|
|
|
|
|
|
|
|
|
///**
//*******************************************************************************
//*
//* //brief
//*     padding (chroma block) at the left of a 2d array
//*
//* //par description:
//*     for every row the left-most two bytes (the 16-bit unit at pu1_src) are
//*     replicated 40 times into the 80 bytes that start at
//*     (pu1_src - pad_size). rows are processed four at a time.
//*
//* //param[in] pu1_src
//*     uword8 pointer to the left-most two-byte unit of the first row
//*
//* //param[in] src_strd
//*     integer source stride (byte distance between consecutive rows)
//*
//* //param[in] ht
//*     integer height of the array (number of rows). must be a multiple of 4,
//*     otherwise it is effectively rounded up to the next multiple of 4.
//*     nothing is read or written when ht <= 0.
//*
//* //param[in] pad_size
//*     integer padding size in bytes. only used to locate the start of the
//*     padding, the number of bytes written per row is fixed at 80.
//*
//* //returns
//*     none
//*
//* //remarks
//*     leaf function, no stack usage.
//*     clobbers x0-x11, v0, v2, v4, v6 and the condition flags
//*     (all caller-saved under aapcs64).
//*
//*******************************************************************************
//*/
//void ihevc_pad_left_chroma(uword8 *pu1_src,
//                           word32 src_strd,
//                           word32 ht,
//                           word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd  (sign-extended into x1 on entry)
//    w2 => ht
//    w3 => pad_size  (sign-extended into x3 on entry)

.globl ihevc_pad_left_chroma_av8

.type ihevc_pad_left_chroma_av8, %function

ihevc_pad_left_chroma_av8:

    // aapcs64 leaves bits [63:32] of a 32-bit argument unspecified, so the
    // int arguments are widened before being used in 64-bit address math
    sxtw        x1,w1                       // x1 = (word64)src_strd
    sxtw        x3,w3                       // x3 = (word64)pad_size

    cmp         w2,#0
    b.le        end_chroma_left             // ht <= 0 : nothing to pad

loop_start_chroma_left:
    // pad size is assumed to be pad_left = 80
    sub         x4,x0,x3                    // x4 = start of padding, row 0

    ldrh        w8,[x0]                     // left-most 2 bytes of row 0
    add         x0,x0,x1
    ldrh        w9,[x0]                     // left-most 2 bytes of row 1
    add         x0,x0,x1
    ldrh        w10,[x0]                    // left-most 2 bytes of row 2
    add         x0,x0,x1
    ldrh        w11,[x0]                    // left-most 2 bytes of row 3
    add         x0,x0,x1                    // x0 = row 0 of the next group

    dup         v0.8h,w8                    // broadcast each pair to 8 lanes
    dup         v2.8h,w9
    dup         v4.8h,w10
    dup         v6.8h,w11

    add         x5,x4,x1                    // x5 = start of padding, row 1

    st1         {v0.16b},[x4],#16           // row 0 : 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    add         x6,x5,x1                    // x6 = start of padding, row 2

    st1         {v2.16b},[x5],#16           // row 1 : 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    add         x7,x6,x1                    // x7 = start of padding, row 3

    st1         {v4.16b},[x6],#16           // row 2 : 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    // rows remaining, counted in 32 bits because ht is a word32.
    // st1 does not alter the flags, so they survive until the branch
    subs        w2,w2,#4

    st1         {v6.16b},[x7],#16           // row 3 : 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store

    b.gt        loop_start_chroma_left      // signed test, terminates for any ht

end_chroma_left:
    ret
|
|
|
|
|
|
|
|
|
|
|
|
///**
//*******************************************************************************
//*
//* //brief
//*     padding (luma block) at the right of a 2d array
//*
//* //par description:
//*     for every row the right-most pixel (the byte at pu1_src - 1) is
//*     replicated into the 80 bytes that start at pu1_src. rows are processed
//*     four at a time.
//*
//* //param[in] pu1_src
//*     uword8 pointer to the first padding byte of the first row, i.e. one
//*     past the right-most pixel
//*
//* //param[in] src_strd
//*     integer source stride (byte distance between consecutive rows)
//*
//* //param[in] ht
//*     integer height of the array (number of rows). must be a multiple of 4,
//*     otherwise it is effectively rounded up to the next multiple of 4.
//*     nothing is read or written when ht <= 0.
//*
//* //param[in] pad_size
//*     integer padding size. not read by this implementation, the number of
//*     bytes written per row is fixed at 80.
//*
//* //returns
//*     none
//*
//* //remarks
//*     leaf function, no stack usage.
//*     clobbers x0-x2, x4-x11, v0, v2, v4, v6 and the condition flags
//*     (all caller-saved under aapcs64).
//*
//*******************************************************************************
//*/
//void ihevc_pad_right_luma(uword8 *pu1_src,
//                          word32 src_strd,
//                          word32 ht,
//                          word32 pad_size)
//
// c reference, for each of the ht rows:
//     memset(pu1_src, *(pu1_src - 1), pad_size)
//     pu1_src += src_strd
//
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd  (sign-extended into x1 on entry)
//    w2 => ht
//    w3 => pad_size  (unused)

.globl ihevc_pad_right_luma_av8

.type ihevc_pad_right_luma_av8, %function

ihevc_pad_right_luma_av8:

    // aapcs64 leaves bits [63:32] of a 32-bit argument unspecified, so the
    // stride is widened before being used in 64-bit address math
    sxtw        x1,w1                       // x1 = (word64)src_strd

    cmp         w2,#0
    b.le        end_luma_right              // ht <= 0 : nothing to pad

loop_start_luma_right:
    // pad size is assumed to be pad_right = 80
    mov         x4,x0                       // x4 = start of padding, row 0

    ldrb        w8,[x0, #-1]                // right-most pixel of row 0
    add         x0,x0,x1
    ldrb        w9,[x0, #-1]                // right-most pixel of row 1
    add         x0,x0,x1
    ldrb        w10,[x0, #-1]               // right-most pixel of row 2
    add         x0,x0,x1
    ldrb        w11,[x0, #-1]               // right-most pixel of row 3
    add         x0,x0,x1                    // x0 = row 0 of the next group

    add         x5,x4,x1                    // x5 = start of padding, row 1
    add         x6,x5,x1                    // x6 = start of padding, row 2
    add         x7,x6,x1                    // x7 = start of padding, row 3

    dup         v0.16b,w8                   // broadcast each pixel to 16 lanes
    dup         v2.16b,w9
    dup         v4.16b,w10
    dup         v6.16b,w11

    st1         {v0.16b},[x4],#16           // row 0 : 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    st1         {v2.16b},[x5],#16           // row 1 : 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    // rows remaining, counted in 32 bits because ht is a word32.
    // st1 does not alter the flags, so they survive until the branch
    subs        w2,w2,#4

    st1         {v4.16b},[x6],#16           // row 2 : 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    st1         {v6.16b},[x7],#16           // row 3 : 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_right store

    b.gt        loop_start_luma_right       // signed test, terminates for any ht

end_luma_right:
    ret
|
|
|
|
|
|
|
|
|
|
|
|
///**
//*******************************************************************************
//*
//* //brief
//*     padding (chroma block) at the right of a 2d array
//*
//* //par description:
//*     for every row the right-most two bytes (the 16-bit unit at
//*     pu1_src - 2) are replicated 40 times into the 80 bytes that start at
//*     pu1_src. rows are processed four at a time.
//*
//* //param[in] pu1_src
//*     uword8 pointer to the first padding byte of the first row, i.e. one
//*     past the right-most two-byte unit
//*
//* //param[in] src_strd
//*     integer source stride (byte distance between consecutive rows)
//*
//* //param[in] ht
//*     integer height of the array (number of rows). must be a multiple of 4,
//*     otherwise it is effectively rounded up to the next multiple of 4.
//*     nothing is read or written when ht <= 0.
//*
//* //param[in] pad_size
//*     integer padding size. not read by this implementation, the number of
//*     bytes written per row is fixed at 80.
//*
//* //returns
//*     none
//*
//* //remarks
//*     leaf function, no stack usage.
//*     clobbers x0-x2, x4-x11, v0, v2, v4, v6 and the condition flags
//*     (all caller-saved under aapcs64).
//*
//*******************************************************************************
//*/
//void ihevc_pad_right_chroma(uword8 *pu1_src,
//                            word32 src_strd,
//                            word32 ht,
//                            word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src
//    w1 => src_strd  (sign-extended into x1 on entry)
//    w2 => ht
//    w3 => pad_size  (unused)

.globl ihevc_pad_right_chroma_av8

.type ihevc_pad_right_chroma_av8, %function

ihevc_pad_right_chroma_av8:

    // aapcs64 leaves bits [63:32] of a 32-bit argument unspecified, so the
    // stride is widened before being used in 64-bit address math
    sxtw        x1,w1                       // x1 = (word64)src_strd

    cmp         w2,#0
    b.le        end_chroma_right            // ht <= 0 : nothing to pad

loop_start_chroma_right:
    // pad size is assumed to be pad_right = 80
    mov         x4,x0                       // x4 = start of padding, row 0

    ldrh        w8,[x0, #-2]                // right-most 2 bytes of row 0
    add         x0,x0,x1
    ldrh        w9,[x0, #-2]                // right-most 2 bytes of row 1
    add         x0,x0,x1
    ldrh        w10,[x0, #-2]               // right-most 2 bytes of row 2
    add         x0,x0,x1
    ldrh        w11,[x0, #-2]               // right-most 2 bytes of row 3
    add         x0,x0,x1                    // x0 = row 0 of the next group

    dup         v0.8h,w8                    // broadcast each pair to 8 lanes
    dup         v2.8h,w9
    dup         v4.8h,w10
    dup         v6.8h,w11

    add         x5,x4,x1                    // x5 = start of padding, row 1

    st1         {v0.16b},[x4],#16           // row 0 : 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    add         x6,x5,x1                    // x6 = start of padding, row 2

    st1         {v2.16b},[x5],#16           // row 1 : 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    add         x7,x6,x1                    // x7 = start of padding, row 3

    st1         {v4.16b},[x6],#16           // row 2 : 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    // rows remaining, counted in 32 bits because ht is a word32.
    // st1 does not alter the flags, so they survive until the branch
    subs        w2,w2,#4

    st1         {v6.16b},[x7],#16           // row 3 : 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_right store

    b.gt        loop_start_chroma_right     // signed test, terminates for any ht

end_chroma_right:
    ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|