You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
348 lines
14 KiB
348 lines
14 KiB
@/******************************************************************************
|
|
@ *
|
|
@ * Copyright (C) 2015 The Android Open Source Project
|
|
@ *
|
|
@ * Licensed under the Apache License, Version 2.0 (the "License");
|
|
@ * you may not use this file except in compliance with the License.
|
|
@ * You may obtain a copy of the License at:
|
|
@ *
|
|
@ * http://www.apache.org/licenses/LICENSE-2.0
|
|
@ *
|
|
@ * Unless required by applicable law or agreed to in writing, software
|
|
@ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
@ * See the License for the specific language governing permissions and
|
|
@ * limitations under the License.
|
|
@ *
|
|
@ *****************************************************************************
|
|
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
@*/
|
|
|
|
.text
|
|
.p2align 2
|
|
|
|
@/*****************************************************************************
|
|
@* *
|
|
@* Function Name : IH264D_CXA8_YUV420toYUV420SP_UV() *
|
|
@* *
|
|
@* Description : This function conversts the image from YUV420P color *
|
|
@* space to 420SP color space(UV interleaved). *
|
|
@* *
|
|
@* Arguments : R0 pu1_y *
|
|
@* R1 pu1_u *
|
|
@* R2 pu1_v *
|
|
@* R3 pu1_dest_y *
|
|
@* [R13 #40] pu1_dest_uv *
|
|
@* [R13 #44] u2_height *
|
|
@* [R13 #48] u2_width *
|
|
@* [R13 #52] u2_stridey *
|
|
@* [R13 #56] u2_strideu *
|
|
@* [R13 #60] u2_stridev *
|
|
@* [R13 #64] u2_dest_stride_y *
|
|
@* [R13 #68] u2_dest_stride_uv *
|
|
@* [R13 #72] convert_uv_only *
|
|
@* *
|
|
@* Values Returned : None *
|
|
@* *
|
|
@* Register Usage : R0 - R14 *
|
|
@* *
|
|
@* Stack Usage : 40 Bytes *
|
|
@* *
|
|
@* Interruptibility : Interruptible *
|
|
@* *
|
|
@* Known Limitations *
|
|
@* Assumptions: Image Width: Assumed to be multiple of 16 and *
|
|
@* greater than or equal to 16 *
|
|
@* Image Height: Assumed to be even. *
|
|
@* *
|
|
@* Revision History : *
|
|
@* DD MM YYYY Author(s) Changes (Describe the changes made) *
|
|
@* 07 06 2010 Varshita Draft *
|
|
@* 07 06 2010 Naveen Kr T Completed *
|
|
@* *
|
|
@*****************************************************************************/
|
|
.global ih264e_fmt_conv_420p_to_420sp_a9q
|
|
|
|
ih264e_fmt_conv_420p_to_420sp_a9q:
|
|
|
|
@// push the registers on the stack
|
|
stmfd sp!, {r4-r12, lr}
|
|
|
|
ldr r4, [sp, #72] @// Load convert_uv_only
|
|
|
|
cmp r4, #1
|
|
beq yuv420sp_uv_chroma
|
|
@/* Do the preprocessing before the main loops start */
|
|
@// Load the parameters from stack
|
|
ldr r4, [sp, #44] @// Load u2_height from stack
|
|
ldr r5, [sp, #48] @// Load u2_width from stack
|
|
ldr r7, [sp, #52] @// Load u2_stridey from stack
|
|
ldr r8, [sp, #64] @// Load u2_dest_stride_y from stack
|
|
sub r7, r7, r5 @// Source increment
|
|
sub r8, r8, r5 @// Destination increment
|
|
|
|
yuv420sp_uv_row_loop_y:
|
|
mov r6, r5
|
|
|
|
yuv420sp_uv_col_loop_y:
|
|
pld [r0, #128]
|
|
vld1.8 {d0, d1}, [r0]!
|
|
vst1.8 {d0, d1}, [r3]!
|
|
sub r6, r6, #16
|
|
cmp r6, #15
|
|
bgt yuv420sp_uv_col_loop_y
|
|
|
|
cmp r6, #0
|
|
beq yuv420sp_uv_row_loop_end_y
|
|
@//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
|
|
@//Ex if width is 162, above loop will process 160 pixels. And
|
|
@//Both source and destination will point to 146th pixel and then 16 bytes will be read
|
|
@// and written using VLD1 and VST1
|
|
rsb r6, r6, #16
|
|
sub r0, r0, r6
|
|
sub r3, r3, r6
|
|
|
|
vld1.8 {d0, d1}, [r0]!
|
|
vst1.8 {d0, d1}, [r3]!
|
|
|
|
yuv420sp_uv_row_loop_end_y:
|
|
add r0, r0, r7
|
|
add r3, r3, r8
|
|
subs r4, r4, #1
|
|
bgt yuv420sp_uv_row_loop_y
|
|
|
|
yuv420sp_uv_chroma:
|
|
|
|
ldr r3, [sp, #40] @// Load pu1_dest_uv from stack
|
|
|
|
ldr r4, [sp, #44] @// Load u2_height from stack
|
|
|
|
ldr r5, [sp, #48] @// Load u2_width from stack
|
|
|
|
|
|
ldr r7, [sp, #56] @// Load u2_strideu from stack
|
|
|
|
ldr r8, [sp, #68] @// Load u2_dest_stride_uv from stack
|
|
|
|
sub r7, r7, r5, lsr #1 @// Source increment
|
|
|
|
sub r8, r8, r5 @// Destination increment
|
|
|
|
mov r5, r5, lsr #1
|
|
mov r4, r4, lsr #1
|
|
ldr r3, [sp, #40] @// Load pu1_dest_uv from stack
|
|
|
|
yuv420sp_uv_row_loop_uv:
|
|
mov r6, r5
|
|
|
|
|
|
yuv420sp_uv_col_loop_uv:
|
|
pld [r1, #128]
|
|
pld [r2, #128]
|
|
vld1.8 d0, [r1]!
|
|
vld1.8 d1, [r2]!
|
|
vst2.8 {d0, d1}, [r3]!
|
|
sub r6, r6, #8
|
|
cmp r6, #7
|
|
bgt yuv420sp_uv_col_loop_uv
|
|
|
|
cmp r6, #0
|
|
beq yuv420sp_uv_row_loop_end_uv
|
|
@//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
|
|
@//Ex if width is 162, above loop will process 160 pixels. And
|
|
@//Both source and destination will point to 146th pixel and then 16 bytes will be read
|
|
@// and written using VLD1 and VST1
|
|
rsb r6, r6, #8
|
|
sub r1, r1, r6
|
|
sub r2, r2, r6
|
|
sub r3, r3, r6, lsl #1
|
|
|
|
vld1.8 d0, [r1]!
|
|
vld1.8 d1, [r2]!
|
|
vst2.8 {d0, d1}, [r3]!
|
|
|
|
yuv420sp_uv_row_loop_end_uv:
|
|
add r1, r1, r7
|
|
add r2, r2, r7
|
|
add r3, r3, r8
|
|
subs r4, r4, #1
|
|
bgt yuv420sp_uv_row_loop_uv
|
|
@//POP THE REGISTERS
|
|
ldmfd sp!, {r4-r12, pc}
|
|
|
|
|
|
|
|
|
|
|
|
@ /**
|
|
@ *******************************************************************************
|
|
@ *
|
|
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
|
|
@ * Function used from format conversion or frame copy
|
|
@ *
|
|
@ *
|
|
@ *
|
|
@ *Inputs : r0 - pu1_y - UWORD8 pointer to y plane.
|
|
@ * r1 - pu1_u - UWORD8 pointer to u plane.
|
|
@ * r2 - pu1_v - UWORD8 pointer to u plane.
|
|
@ * r3 - pu2_yuv422i - UWORD16 pointer to yuv422iimage.
|
|
@ * stack + 40 - u4_width - Width of the Y plane.
|
|
@ * 44 - u4_height - Height of the Y plane.
|
|
@ * 48 - u4_stride_y - Stride in pixels of Y plane.
|
|
@ * 52 - u4_stride_u - Stride in pixels of U plane.
|
|
@ * 56 - u4_stride_v - Stride in pixels of V plane.
|
|
@ * 60 - u4_stride_yuv422i- Stride in pixels of yuv422i image.
|
|
@ *
|
|
@ * @par Description
|
|
@ * Function used from copying or converting a reference frame to display buffer
|
|
@ * in non shared mode
|
|
@ *
|
|
@ * @param[in] pu1_y_dst
|
|
@ * Output Y pointer
|
|
@ *
|
|
@ * @param[in] pu1_u_dst
|
|
@ * Output U/UV pointer ( UV is interleaved in the same format as that of input)
|
|
@ *
|
|
@ * @param[in] pu1_v_dst
|
|
@ * Output V pointer ( used in 420P output case)
|
|
@ *
|
|
@ * @param[in] u4_dst_y_strd
|
|
@ * Stride of destination Y buffer
|
|
@ *
|
|
@ * @param[in] u4_dst_u_strd
|
|
@ * Stride of destination U/V buffer
|
|
@ *
|
|
@ *
|
|
@ * @param[in] blocking
|
|
@ * To indicate whether format conversion should wait till frame is reconstructed
|
|
@ * and then return after complete copy is done. To be set to 1 when called at the
|
|
@ * end of frame processing and set to 0 when called between frame processing modules
|
|
@ * in order to utilize available MCPS
|
|
@ *
|
|
@ * @returns Error from IH264E_ERROR_T
|
|
@ *
|
|
@ * @remarks
|
|
@ * Assumes that the stride of U and V buffers are same.
|
|
@ * This is correct in most cases
|
|
@ * If a case comes where this is not true we need to modify the fmt conversion funcnions called inside also
|
|
@ * Since we read 4 pixels ata time the width should be aligned to 4
|
|
@ * In assembly width should be aligned to 16 and height to 2.
|
|
@ *
|
|
@ *
|
|
@ * Revision History :
|
|
@ * DD MM YYYY Author(s) Changes (Describe the changes made)
|
|
@ * 07 06 2010 Harinarayanan K K Adapeted to 422p
|
|
@ *
|
|
@ *******************************************************************************
|
|
@ */
|
|
|
|
@//`
|
|
@*/
|
|
.global ih264e_fmt_conv_422i_to_420sp_a9q
|
|
ih264e_fmt_conv_422i_to_420sp_a9q:
|
|
stmfd sp!, {r4-r12, lr} @// Back the register which are used
|
|
|
|
|
|
|
|
@/* Do the preprocessing before the main loops start */
|
|
@// Load the parameters from stack
|
|
ldr r4, [sp, #48] @// Load u4_stride_y from stack
|
|
|
|
ldr r5, [sp, #60] @// Load u4_stride_yuv422i from stack
|
|
add r6, r0, r4 @// pu1_y_nxt_row = pu1_y + u4_stride_y
|
|
|
|
ldr r7, [sp, #40] @// Load u4_width from stack
|
|
add r8, r3, r5, lsl #1 @// pu2_yuv422i_nxt_row = pu2_yuv422i_y + u4_stride_yuv422i(2 Bytes for each pixel)
|
|
|
|
ldr r9, [sp, #52] @// Load u4_stride_u from stack
|
|
sub r12, r4, r7 @// u2_offset1 = u4_stride_y - u4_width
|
|
|
|
@LDR r10,[sp,#56] ;// Load u4_stride_v from stack
|
|
sub r14, r5, r7 @// u2_offset_yuv422i = u4_stride_yuv422i - u4_width
|
|
|
|
ldr r11, [sp, #44] @// Load u4_height from stack
|
|
sub r9, r9, r7 @// u2_offset2 = u4_stride_u - u4_width >> 1
|
|
|
|
@ SUB r10,r10,r7,ASR #1 ;// u2_offset3 = u4_stride_v - u4_width >> 1
|
|
mov r14, r14, lsl #1 @// u2_offset_yuv422i = u2_offset_yuv422i * 2
|
|
|
|
mov r11, r11, asr #1 @// u4_width = u4_width / 2 (u4_width >> 1)
|
|
|
|
add r4, r12, r4 @// u2_offset1 = u2_offset1 + u4_stride_y
|
|
add r5, r14, r5, lsl #1 @// u2_offset_yuv422i = u2_offset_yuv422i + u4_stride_yuv422i
|
|
|
|
@// Register Assignment
|
|
@// pu1_y - r0
|
|
@// pu1_y_nxt_row - r6
|
|
@// pu1_u - r1
|
|
@// pu1_v - r2
|
|
@// pu2_yuv422i - r3
|
|
@// pu2_yuv422i_nxt_row - r8
|
|
@// u2_offset1 - r4
|
|
@// u2_offset2 - r9
|
|
@// u2_offset3 - r10
|
|
@// u2_offset_yuv422i - r5
|
|
@// u4_width / 16 - r7
|
|
@// u4_height / 2 - r11
|
|
@// inner loop count - r12
|
|
yuv422i_to_420sp_height_loop:
|
|
|
|
mov r12, r7 @// Inner loop count = u4_width / 16
|
|
|
|
yuv422i_to_420sp_width_loop:
|
|
vld4.8 {d0, d1, d2, d3}, [r3]! @// Load the 16 elements of row 1
|
|
vld4.8 {d4, d5, d6, d7}, [r8]! @// Load the 16 elements of row 2
|
|
sub r12, r12, #16
|
|
|
|
vrhadd.u8 d0, d0, d4
|
|
vrhadd.u8 d2, d2, d6
|
|
|
|
vst2.8 {d1, d3}, [r0]! @// Store the 16 elements of row1 Y
|
|
vst2.8 {d5, d7}, [r6]! @// Store the 16 elements of row2 Y
|
|
|
|
vst2.8 {d0, d2}, [r1]! @// Store the 8 elements of row1/2 U
|
|
|
|
cmp r12, #15
|
|
bgt yuv422i_to_420sp_width_loop
|
|
cmp r12, #0
|
|
beq yuv422i_to_420sp_row_loop_end
|
|
|
|
@//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
|
|
@//Ex if width is 162, above loop will process 160 pixels. And
|
|
@//Both source and destination will point to 146th pixel and then 16 bytes will be read
|
|
@// and written using VLD1 and VST1
|
|
rsb r12, r12, #16
|
|
sub r3, r3, r12, lsl #1
|
|
sub r8, r8, r12, lsl #1
|
|
sub r0, r0, r12
|
|
sub r6, r6, r12
|
|
sub r1, r1, r12
|
|
|
|
vld4.8 {d0, d1, d2, d3}, [r3]! @// Load the 16 elements of row 1
|
|
vld4.8 {d4, d5, d6, d7}, [r8]! @// Load the 16 elements of row 2
|
|
|
|
vrhadd.u8 d0, d0, d4
|
|
vrhadd.u8 d2, d2, d6
|
|
|
|
vst2.8 {d1, d3}, [r0]! @// Store the 16 elements of row1 Y
|
|
vst2.8 {d5, d7}, [r6]! @// Store the 16 elements of row2 Y
|
|
|
|
vst2.8 {d0, d2}, [r1]! @// Store the 8 elements of row1/2 U
|
|
|
|
yuv422i_to_420sp_row_loop_end:
|
|
@// Update the buffer pointer so that they will refer to next pair of rows
|
|
add r0, r0, r4 @// pu1_y = pu1_y + u2_offset1
|
|
add r6, r6, r4 @// pu1_y_nxt_row = pu1_y_nxt_row + u2_offset1
|
|
|
|
add r1, r1, r9 @// pu1_u = pu1_u + u2_offset2
|
|
subs r11, r11, #1
|
|
|
|
add r3, r3, r5 @// pu2_yuv422i = pu2_yuv422i + u2_offset_yuv422i
|
|
|
|
add r8, r8, r5 @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row + u2_offset_yuv422i
|
|
bgt yuv422i_to_420sp_height_loop
|
|
ldmfd sp!, {r4-r12, pc} @// Restore the register which are used
|
|
|
|
|
|
|