You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
808 lines
24 KiB
808 lines
24 KiB
@/******************************************************************************
|
|
@ *
|
|
@ * Copyright (C) 2015 The Android Open Source Project
|
|
@ *
|
|
@ * Licensed under the Apache License, Version 2.0 (the "License");
|
|
@ * you may not use this file except in compliance with the License.
|
|
@ * You may obtain a copy of the License at:
|
|
@ *
|
|
@ * http://www.apache.org/licenses/LICENSE-2.0
|
|
@ *
|
|
@ * Unless required by applicable law or agreed to in writing, software
|
|
@ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
@ * See the License for the specific language governing permissions and
|
|
@ * limitations under the License.
|
|
@ *
|
|
@ *****************************************************************************
|
|
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
@*/
|
|
|
|
@/*
|
|
@//----------------------------------------------------------------------------
|
|
@// File Name : impeg2_inter_pred.s
|
|
@//
|
|
@// Description : This file has motion compensation related
|
|
@// interpolation functions on Neon + CortexA-8 platform
|
|
@//
|
|
@// Reference Document :
|
|
@//
|
|
@// Revision History :
|
|
@// Date Author Detail Description
|
|
@// ------------ ---------------- ----------------------------------
|
|
@// 18 jun 2010 S Hamsalekha Created
|
|
@//
|
|
@//-------------------------------------------------------------------------
|
|
@*/
|
|
|
|
@/*
|
|
@// ----------------------------------------------------------------------------
|
|
@// Include Files
|
|
@// ----------------------------------------------------------------------------
|
|
@*/
|
|
.text
|
|
.p2align 2
|
|
|
|
|
|
@/*
|
|
@// ----------------------------------------------------------------------------
|
|
@// Struct/Union Types and Define
|
|
@// ----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
|
|
@/*
|
|
@// ----------------------------------------------------------------------------
|
|
@// Static Global Data section variables
|
|
@// ----------------------------------------------------------------------------
|
|
@*/
|
|
@// -------------------------- NONE --------------------------------------------
|
|
|
|
|
|
@/*
|
|
@// ----------------------------------------------------------------------------
|
|
@// Static Prototype Functions
|
|
@// ----------------------------------------------------------------------------
|
|
@*/
|
|
@// -------------------------- NONE --------------------------------------------
|
|
|
|
@/*
|
|
@// ----------------------------------------------------------------------------
|
|
@// Exported functions
|
|
@// ----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
@//---------------------------------------------------------------------------
|
|
@// Function Name : impeg2_copy_mb_a9q()
|
|
@//
|
|
@// Detail Description : Copies one MB worth of data from src to the dst
|
|
@//
|
|
@// Inputs : r0 - pointer to src
|
|
@// r1 - pointer to dst
|
|
@// r2 - source width
|
|
@// r3 - destination width
|
|
@// Registers Used : r4, r5, d0, d1
|
|
@//
|
|
@// Stack Usage : 12 bytes
|
|
@//
|
|
@// Outputs :
|
|
@//
|
|
@// Return Data : None
|
|
@//
|
|
@// Programming Note : <program limitation>
|
|
@//-----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
|
|
|
|
.global impeg2_copy_mb_a9q
|
|
|
|
|
|
impeg2_copy_mb_a9q:
|
|
|
|
stmfd sp!, {r4, r5, r14}
|
|
|
|
|
|
ldr r4, [r0] @src->y
|
|
ldr r5, [r1] @dst->y
|
|
@Read one row of data from the src
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
|
|
@//Repeat 15 times for y
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0, d1}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
|
|
|
|
mov r2, r2, lsr #1 @src_offset /= 2
|
|
mov r3, r3, lsr #1 @dst_offset /= 2
|
|
|
|
ldr r4, [r0, #4] @src->u
|
|
ldr r5, [r1, #4] @dst->u
|
|
@Read one row of data from the src
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
|
|
@//Repeat 7 times for u
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
|
|
ldr r4, [r0, #8] @src->v
|
|
ldr r5, [r1, #8] @dst->v
|
|
@Read one row of data from the src
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
|
|
@//Repeat 7 times for v
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
vld1.8 {d0}, [r4], r2 @Load and increment src
|
|
vst1.8 {d0}, [r5], r3 @Store and increment dst
|
|
|
|
ldmfd sp!, {r4, r5, pc}
|
|
|
|
|
|
|
|
|
|
@/*
|
|
@//---------------------------------------------------------------------------
|
|
@// Function Name : impeg2_mc_fullx_halfy_8x8_a9q()
|
|
@//
|
|
@// Detail Description : This function pastes the reference block in the
|
|
@// current frame buffer.This function is called for
|
|
@// blocks that are not coded and have motion vectors
|
|
@// with a half pel resolution.
|
|
@//
|
|
@// Inputs : r0 - out : Current Block Pointer
|
|
@// r1 - ref : Refernce Block Pointer
|
|
@// r2 - ref_wid : Refernce Block Width
|
|
@// r3 - out_wid ; Current Block Width
|
|
@//
|
|
@// Registers Used : D0-D9
|
|
@//
|
|
@// Stack Usage : 4 bytes
|
|
@//
|
|
@// Outputs : The Motion Compensated Block
|
|
@//
|
|
@// Return Data : None
|
|
@//
|
|
@// Programming Note : <program limitation>
|
|
@//-----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
.global impeg2_mc_fullx_halfy_8x8_a9q
|
|
|
|
impeg2_mc_fullx_halfy_8x8_a9q:
|
|
|
|
stmfd sp!, {r14}
|
|
vpush {d8-d9}
|
|
add r14, r1, r2
|
|
mov r2, r2, lsl #1
|
|
|
|
@/* Load 8 + 1 rows from reference block */
|
|
@/* Do the addition with out rounding off as rounding value is 1 */
|
|
vld1.8 {d0}, [r1], r2 @// first row hence r1 = D0
|
|
vld1.8 {d2}, [r14], r2 @// second row hence r2 = D2
|
|
vld1.8 {d4}, [r1], r2 @// third row hence r3 = D4
|
|
vld1.8 {d6}, [r14], r2 @// fourth row hence r4 = D6
|
|
vld1.8 {d1}, [r1], r2 @// fifth row hence r5 = D1
|
|
vld1.8 {d3}, [r14], r2 @// sixth row hence r6 = D3
|
|
vrhadd.u8 d9, d1, d6 @// estimated row 4 = D9
|
|
vld1.8 {d5}, [r1], r2 @// seventh row hence r7 = D5
|
|
vrhadd.u8 q0, q0, q1 @// estimated row 1 = D0, row 5 = D1
|
|
vld1.8 {d7}, [r14], r2 @// eighth row hence r8 = D7
|
|
vrhadd.u8 q1, q1, q2 @// estimated row 2 = D2, row 6 = D3
|
|
vld1.8 {d8}, [r1], r2 @// ninth row hence r9 = D8
|
|
vrhadd.u8 q2, q2, q3 @// estimated row 3 = D4, row 7 = D5
|
|
|
|
add r14, r0, r3
|
|
mov r3, r3, lsl #1
|
|
|
|
@/* Store the eight rows calculated above */
|
|
vst1.8 {d2}, [r14], r3 @// second row hence D2
|
|
vrhadd.u8 d7, d7, d8 @// estimated row 8 = D7
|
|
vst1.8 {d0}, [r0], r3 @// first row hence D0
|
|
vst1.8 {d9}, [r14], r3 @// fourth row hence D9
|
|
vst1.8 {d4}, [r0], r3 @// third row hence D4
|
|
vst1.8 {d3}, [r14], r3 @// sixth row hence r6 = D3
|
|
vst1.8 {d1}, [r0], r3 @// fifth row hence r5 = D1
|
|
vst1.8 {d7}, [r14], r3 @// eighth row hence r8 = D7
|
|
vst1.8 {d5}, [r0], r3 @// seventh row hence r7 = D5
|
|
|
|
vpop {d8-d9}
|
|
ldmfd sp!, {pc}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@/*
|
|
@//---------------------------------------------------------------------------
|
|
@// Function Name : impeg2_mc_halfx_fully_8x8_a9q()
|
|
@//
|
|
@// Detail Description : This function pastes the reference block in the
|
|
@// current frame buffer.This function is called for
|
|
@// blocks that are not coded and have motion vectors
|
|
@// with a half pel resolutionand VopRoundingType is 0 ..
|
|
@//
|
|
@// Inputs : r0 - out : Current Block Pointer
|
|
@// r1 - ref : Refernce Block Pointer
|
|
@// r2 - ref_wid : Refernce Block Width
|
|
@// r3 - out_wid ; Current Block Width
|
|
@//
|
|
@// Registers Used : r12, r14, d0-d10, d12-d14, d16-d18, d20-d22
|
|
|
|
@//
|
|
@// Stack Usage : 8 bytes
|
|
@//
|
|
@// Outputs : The Motion Compensated Block
|
|
@//
|
|
@// Return Data : None
|
|
@//
|
|
@// Programming Note : <program limitation>
|
|
@//-----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
|
|
|
|
.global impeg2_mc_halfx_fully_8x8_a9q
|
|
|
|
|
|
|
|
impeg2_mc_halfx_fully_8x8_a9q:
|
|
|
|
stmfd sp!, {r12, lr}
|
|
|
|
add r14, r1, r2, lsl #2
|
|
|
|
add r12, r0, r3, lsl#2
|
|
|
|
vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1
|
|
|
|
vld1.8 {d2, d3}, [r14], r2 @ row5
|
|
|
|
|
|
vld1.8 {d4, d5}, [r1], r2 @load 16 pixels row2
|
|
|
|
vld1.8 {d6, d7}, [r14], r2 @row6
|
|
|
|
|
|
vext.8 d24, d0, d1, #1 @Extract pixels (1-8) of row1
|
|
|
|
vext.8 d28, d2, d3, #1 @Extract pixels (1-8) of row5
|
|
|
|
vext.8 d16, d4, d5, #1 @Extract pixels (1-8) of row2
|
|
|
|
vext.8 d20, d6, d7, #1 @Extract pixels (1-8) of row6
|
|
|
|
|
|
vld1.8 {d25, d26}, [r1], r2 @load row3
|
|
|
|
vld1.8 {d29, d30}, [r14], r2 @load row7
|
|
|
|
vld1.8 {d17, d18}, [r1], r2 @load row4
|
|
|
|
vld1.8 {d21, d22}, [r14], r2 @load row8
|
|
|
|
|
|
vext.8 d1, d25, d26, #1 @Extract pixels (1-8) of row3
|
|
|
|
vext.8 d3, d29, d30, #1 @Extract pixels (1-8) of row7
|
|
|
|
|
|
|
|
vext.8 d5, d17, d18, #1 @Extract pixels (1-8) of row4
|
|
|
|
vext.8 d7, d21, d22, #1 @Extract pixels (1-8) of row8
|
|
|
|
|
|
vrhadd.u8 q0, q0, q12 @operate on row1 and row3
|
|
|
|
vrhadd.u8 q1, q1, q14 @operate on row5 and row7
|
|
|
|
|
|
vrhadd.u8 q2, q2, q8 @operate on row2 and row4
|
|
|
|
|
|
|
|
vrhadd.u8 q3, q3, q10 @operate on row6 and row8
|
|
|
|
vst1.8 d0, [r0], r3 @store row1
|
|
|
|
vst1.8 d2, [r12], r3 @store row5
|
|
|
|
vst1.8 d4, [r0], r3 @store row2
|
|
|
|
vst1.8 d6, [r12], r3 @store row6
|
|
|
|
vst1.8 d1, [r0], r3 @store row3
|
|
|
|
vst1.8 d3, [r12], r3 @store row7
|
|
|
|
vst1.8 d5, [r0], r3 @store row4
|
|
|
|
vst1.8 d7, [r12], r3 @store row8
|
|
|
|
|
|
|
|
ldmfd sp!, {r12, pc}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@/*
|
|
@//---------------------------------------------------------------------------
|
|
@// Function Name : impeg2_mc_halfx_halfy_8x8_a9q()
|
|
@//
|
|
@// Detail Description : This function pastes the reference block in the
|
|
@// current frame buffer.This function is called for
|
|
@// blocks that are not coded and have motion vectors
|
|
@// with a half pel resolutionand VopRoundingType is 0 ..
|
|
@//
|
|
@// Inputs : r0 - out : Current Block Pointer
|
|
@// r1 - ref : Refernce Block Pointer
|
|
@// r2 - ref_wid : Refernce Block Width
|
|
@// r3 - out_wid ; Current Block Width
|
|
@//
|
|
@// Registers Used : r14, q0-q15
|
|
|
|
@//
|
|
@// Stack Usage : 4 bytes
|
|
@//
|
|
@// Outputs : The Motion Compensated Block
|
|
@//
|
|
@// Return Data : None
|
|
@//
|
|
@// Programming Note : <program limitation>
|
|
@//-----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
|
|
.global impeg2_mc_halfx_halfy_8x8_a9q
|
|
|
|
impeg2_mc_halfx_halfy_8x8_a9q:
|
|
|
|
stmfd sp!, {r14}
|
|
vpush {d8-d15}
|
|
|
|
add r14, r1, r2, lsl #2
|
|
|
|
vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1
|
|
|
|
vld1.8 {d2, d3}, [r14], r2 @ row5
|
|
|
|
vld1.8 {d4, d5}, [r1], r2 @load 16 pixels row2
|
|
|
|
vld1.8 {d6, d7}, [r14], r2 @row6
|
|
|
|
vext.8 d1, d0, d1, #1 @Extract pixels (1-8) of row1
|
|
|
|
|
|
|
|
vext.8 d3, d2, d3, #1 @Extract pixels (1-8) of row5
|
|
|
|
|
|
|
|
vext.8 d5, d4, d5, #1 @Extract pixels (1-8) of row2
|
|
|
|
vext.8 d7, d6, d7, #1 @Extract pixels (1-8) of row6
|
|
|
|
|
|
|
|
|
|
vld1.8 {d8, d9}, [r1], r2 @load row3
|
|
|
|
|
|
|
|
vld1.8 {d10, d11}, [r14], r2 @load row7
|
|
|
|
vld1.8 {d12, d13}, [r1], r2 @load row4
|
|
|
|
vld1.8 {d14, d15}, [r14], r2 @load row8
|
|
|
|
vext.8 d9, d8, d9, #1 @Extract pixels (1-8) of row3
|
|
|
|
vld1.8 {d16, d17}, [r14], r2 @load row9
|
|
|
|
|
|
|
|
|
|
|
|
vext.8 d11, d10, d11, #1 @Extract pixels (1-8) of row7
|
|
|
|
|
|
|
|
vext.8 d13, d12, d13, #1 @Extract pixels (1-8) of row4
|
|
|
|
|
|
|
|
vext.8 d15, d14, d15, #1 @Extract pixels (1-8) of row8
|
|
|
|
vext.8 d17, d16, d17, #1 @Extract pixels (1-8) of row9
|
|
|
|
|
|
@interpolation in x direction
|
|
|
|
vaddl.u8 q0, d0, d1 @operate row1
|
|
|
|
vaddl.u8 q1, d2, d3 @operate row5
|
|
|
|
vaddl.u8 q2, d4, d5 @operate row2
|
|
|
|
vaddl.u8 q3, d6, d7 @operate row6
|
|
|
|
vaddl.u8 q4, d8, d9 @operate row3
|
|
|
|
vaddl.u8 q5, d10, d11 @operate row7
|
|
|
|
vaddl.u8 q6, d12, d13 @operate row4
|
|
|
|
vaddl.u8 q7, d14, d15 @operate row8
|
|
|
|
vaddl.u8 q8, d16, d17 @operate row9
|
|
|
|
@interpolation in y direction
|
|
|
|
add r14, r0, r3, lsl #2
|
|
|
|
|
|
|
|
vadd.u16 q9, q0, q2 @operate row1 and row2
|
|
|
|
vadd.u16 q13, q1, q3 @operate row5 and row6
|
|
|
|
vadd.u16 q10, q2, q4 @operate row2 and row3
|
|
|
|
vadd.u16 q14, q3, q5 @operate row6 and row7
|
|
|
|
vrshrn.u16 d18, q9, #2 @row1
|
|
|
|
vrshrn.u16 d26, q13, #2 @row5
|
|
|
|
vrshrn.u16 d20, q10, #2 @row2
|
|
|
|
vrshrn.u16 d28, q14, #2 @row6
|
|
|
|
vadd.u16 q11, q4, q6 @operate row3 and row4
|
|
|
|
vst1.8 d18, [r0], r3 @store row1
|
|
|
|
vadd.u16 q15, q5, q7 @operate row7 and row8
|
|
|
|
vst1.8 d26, [r14], r3 @store row5
|
|
|
|
vadd.u16 q12, q6, q1 @operate row4 and row5
|
|
|
|
vst1.8 d20, [r0], r3 @store row2
|
|
|
|
vadd.u16 q7, q7, q8 @operate row8 and row9
|
|
|
|
vst1.8 d28, [r14], r3 @store row6
|
|
|
|
|
|
|
|
vrshrn.u16 d22, q11, #2 @row3
|
|
|
|
vrshrn.u16 d30, q15, #2 @row7
|
|
|
|
vrshrn.u16 d24, q12, #2 @row4
|
|
|
|
vrshrn.u16 d14, q7, #2 @row8
|
|
|
|
|
|
vst1.8 d22, [r0], r3 @store row3
|
|
vst1.8 d30, [r14], r3 @store row7
|
|
vst1.8 d24, [r0], r3 @store row4
|
|
vst1.8 d14, [r14], r3 @store row8
|
|
|
|
|
|
|
|
vpop {d8-d15}
|
|
ldmfd sp!, {pc}
|
|
|
|
|
|
|
|
|
|
|
|
@/*
|
|
@//---------------------------------------------------------------------------
|
|
@// Function Name : impeg2_mc_fullx_fully_8x8_a9q()
|
|
@//
|
|
@// Detail Description : This function pastes the reference block in the
|
|
@// current frame buffer.This function is called for
|
|
@// blocks that are not coded and have motion vectors
|
|
@// with a half pel resolutionand ..
|
|
@//
|
|
@// Inputs : r0 - out : Current Block Pointer
|
|
@// r1 - ref : Refernce Block Pointer
|
|
@// r2 - ref_wid : Refernce Block Width
|
|
@// r3 - out_wid ; Current Block Width
|
|
@//
|
|
@// Registers Used : r12, r14, d0-d3
|
|
|
|
@//
|
|
@// Stack Usage : 8 bytes
|
|
@//
|
|
@// Outputs : The Motion Compensated Block
|
|
@//
|
|
@// Return Data : None
|
|
@//
|
|
@// Programming Note : <program limitation>
|
|
@//-----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
|
|
.global impeg2_mc_fullx_fully_8x8_a9q
|
|
impeg2_mc_fullx_fully_8x8_a9q:
|
|
|
|
|
|
stmfd sp!, {r12, lr}
|
|
|
|
add r14, r1, r2, lsl #2
|
|
|
|
add r12, r0, r3, lsl #2
|
|
|
|
|
|
vld1.8 d0, [r1], r2 @load row1
|
|
|
|
vld1.8 d1, [r14], r2 @load row4
|
|
|
|
vld1.8 d2, [r1], r2 @load row2
|
|
|
|
vld1.8 d3, [r14], r2 @load row5
|
|
|
|
|
|
vst1.8 d0, [r0], r3 @store row1
|
|
|
|
vst1.8 d1, [r12], r3 @store row4
|
|
|
|
vst1.8 d2, [r0], r3 @store row2
|
|
|
|
vst1.8 d3, [r12], r3 @store row5
|
|
|
|
|
|
vld1.8 d0, [r1], r2 @load row3
|
|
|
|
vld1.8 d1, [r14], r2 @load row6
|
|
|
|
vld1.8 d2, [r1], r2 @load row4
|
|
|
|
vld1.8 d3, [r14], r2 @load row8
|
|
|
|
|
|
vst1.8 d0, [r0], r3 @store row3
|
|
|
|
vst1.8 d1, [r12], r3 @store row6
|
|
|
|
vst1.8 d2, [r0], r3 @store row4
|
|
|
|
vst1.8 d3, [r12], r3 @store row8
|
|
|
|
|
|
ldmfd sp!, {r12, pc}
|
|
|
|
|
|
|
|
|
|
|
|
@/*
|
|
@//---------------------------------------------------------------------------
|
|
@// Function Name : impeg2_interpolate_a9q()
|
|
@//
|
|
@// Detail Description : interpolates two buffers and adds pred
|
|
@//
|
|
@// Inputs : r0 - pointer to src1
|
|
@// r1 - pointer to src2
|
|
@// r2 - dest buf
|
|
@// r3 - dst stride
|
|
@// Registers Used : r4, r5, r7, r14, d0-d15
|
|
@//
|
|
@// Stack Usage : 20 bytes
|
|
@//
|
|
@// Outputs : The Motion Compensated Block
|
|
@//
|
|
@// Return Data : None
|
|
@//
|
|
@// Programming Note : <program limitation>
|
|
@//-----------------------------------------------------------------------------
|
|
@*/
|
|
|
|
|
|
.global impeg2_interpolate_a9q
|
|
|
|
|
|
impeg2_interpolate_a9q:
|
|
|
|
stmfd sp!, {r4, r5, r7, r12, r14}
|
|
vpush {d8-d15}
|
|
|
|
ldr r4, [r0, #0] @ptr_y src1
|
|
|
|
ldr r5, [r1, #0] @ptr_y src2
|
|
|
|
ldr r7, [r2, #0] @ptr_y dst buf
|
|
|
|
mov r12, #4 @counter for number of blocks
|
|
|
|
|
|
interp_lumablocks_stride:
|
|
|
|
vld1.8 {d0, d1}, [r4]! @row1 src1
|
|
|
|
vld1.8 {d2, d3}, [r4]! @row2 src1
|
|
|
|
vld1.8 {d4, d5}, [r4]! @row3 src1
|
|
|
|
vld1.8 {d6, d7}, [r4]! @row4 src1
|
|
|
|
|
|
vld1.8 {d8, d9}, [r5]! @row1 src2
|
|
|
|
vld1.8 {d10, d11}, [r5]! @row2 src2
|
|
|
|
vld1.8 {d12, d13}, [r5]! @row3 src2
|
|
|
|
vld1.8 {d14, d15}, [r5]! @row4 src2
|
|
|
|
|
|
|
|
|
|
vrhadd.u8 q0, q0, q4 @operate on row1
|
|
|
|
vrhadd.u8 q1, q1, q5 @operate on row2
|
|
|
|
vrhadd.u8 q2, q2, q6 @operate on row3
|
|
|
|
vrhadd.u8 q3, q3, q7 @operate on row4
|
|
|
|
|
|
|
|
vst1.8 {d0, d1}, [r7], r3 @row1
|
|
|
|
vst1.8 {d2, d3}, [r7], r3 @row2
|
|
|
|
vst1.8 {d4, d5}, [r7], r3 @row3
|
|
|
|
vst1.8 {d6, d7}, [r7], r3 @row4
|
|
|
|
subs r12, r12, #1
|
|
|
|
bne interp_lumablocks_stride
|
|
|
|
|
|
mov r3, r3, lsr #1 @stride >> 1
|
|
|
|
ldr r4, [r0, #4] @ptr_u src1
|
|
|
|
ldr r5, [r1, #4] @ptr_u src2
|
|
|
|
ldr r7 , [r2, #4] @ptr_u dst buf
|
|
|
|
mov r12, #2 @counter for number of blocks
|
|
|
|
|
|
|
|
@chroma blocks
|
|
|
|
interp_chromablocks_stride:
|
|
|
|
vld1.8 {d0, d1}, [r4]! @row1 & 2 src1
|
|
|
|
vld1.8 {d2, d3}, [r4]! @row3 & 4 src1
|
|
|
|
vld1.8 {d4, d5}, [r4]! @row5 & 6 src1
|
|
|
|
vld1.8 {d6, d7}, [r4]! @row7 & 8 src1
|
|
|
|
|
|
vld1.8 {d8, d9}, [r5]! @row1 & 2 src2
|
|
|
|
vld1.8 {d10, d11}, [r5]! @row3 & 4 src2
|
|
|
|
vld1.8 {d12, d13}, [r5]! @row5 & 6 src2
|
|
|
|
vld1.8 {d14, d15}, [r5]! @row7 & 8 src2
|
|
|
|
|
|
|
|
|
|
vrhadd.u8 q0, q0, q4 @operate on row1 & 2
|
|
|
|
vrhadd.u8 q1, q1, q5 @operate on row3 & 4
|
|
|
|
vrhadd.u8 q2, q2, q6 @operate on row5 & 6
|
|
|
|
vrhadd.u8 q3, q3, q7 @operate on row7 & 8
|
|
|
|
|
|
vst1.8 {d0}, [r7], r3 @row1
|
|
|
|
vst1.8 {d1}, [r7], r3 @row2
|
|
|
|
vst1.8 {d2}, [r7], r3 @row3
|
|
|
|
vst1.8 {d3}, [r7], r3 @row4
|
|
|
|
vst1.8 {d4}, [r7], r3 @row5
|
|
|
|
vst1.8 {d5}, [r7], r3 @row6
|
|
|
|
vst1.8 {d6}, [r7], r3 @row7
|
|
|
|
vst1.8 {d7}, [r7], r3 @row8
|
|
|
|
|
|
|
|
ldr r4, [r0, #8] @ptr_v src1
|
|
|
|
ldr r5, [r1, #8] @ptr_v src2
|
|
|
|
ldr r7, [r2, #8] @ptr_v dst buf
|
|
|
|
subs r12, r12, #1
|
|
|
|
bne interp_chromablocks_stride
|
|
|
|
|
|
vpop {d8-d15}
|
|
ldmfd sp!, {r4, r5, r7, r12, pc}
|
|
|
|
|
|
|
|
|
|
|