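@ ixheaacd_fft_15_ld_armv7: fixed-point 15-point FFT for the low-delay
@ AAC path, fully unrolled as three radix-5 butterflies into a scratch
@ buffer followed by five radix-3 butterflies with re-ordered output
@ (15 = 5 * 3).
@
@ Register use at entry, inferred from the code below (the argument
@ roles are an editor's reading of the loads/stores, not taken from a
@ C prototype):
@   r0 - input buffer, read with a 384-byte stride
@   r1 - output buffer (pushed here, popped into r11 for stage 2)
@   r2 - fft3out scratch buffer, kept in lr
@   r3 - byte re-ordering table (pushed here, popped into r10)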
.text
.p2align 2
.global ixheaacd_fft_15_ld_armv7

ixheaacd_fft_15_ld_armv7:

    STMFD r13!, {r4 - r12, r14} @ save callee-saved registers and return address
    STR r1, [r13, #-4]!         @ push r1 (popped into r11 before stage 2)
    STR r3, [r13, #-4]!         @ push r3 (popped into r10 before stage 2)
    MOV lr, r2                  @ lr = fft3out
    MOV r12, #384               @ input stride in bytes

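@ ---------------------------------------------------------------------
@ Stage 1: three radix-5 butterflies over the strided input, each one
@ writing 10 words to fft3out (lr).  The names r1..r4, s1..s4, t,
@ temp1, temp2 in the comments are the locals of the reference C
@ radix-5 kernel, not ARM registers.  A sketch of the even (real)
@ half of that kernel, reconstructed from the inline comments below,
@ so an approximation rather than the verified reference source:
@
@   r1 = buf[2] + buf[8];          r4 = buf[2] - buf[8];
@   r3 = buf[4] + buf[6];          r2 = buf[4] - buf[6];
@   t  = mult32x16in32_shl(r1 - r3, C54);
@   r1 = r1 + r3;
@   temp1 = buf[0] + r1;           /* out[0], stored immediately */
@   r1 = temp1 + (mult32x16in32_shl(r1, C55) << 1);
@   r3 = r1 - t;                   r1 = r1 + t;
@   t  = mult32x16in32_shl(r4 + r2, C51);
@   r4 = t + (mult32x16in32_shl(r4, C52) << 1);
@   r2 = t + mult32x16in32_shl(r2, C53);
@   /* the s1..s4 path applies the same steps to the odd words, and */
@   /* the two halves are combined into out[2]..out[9]              */
@
@ The 16-bit constants are kept packed two per register: SMULWT
@ multiplies by the top halfword, SMULWB by the bottom halfword,
@ each producing (a * b) >> 16.
@ ---------------------------------------------------------------------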
LOOP_FFT5:
    LDRD r2, [r0]               @ r2 = buf1a[0] and r3 = buf1a[1]
    ADD r0, r0, r12
    LDRD r4, [r0]               @ r4 = buf1a[2] and r5 = buf1a[3]
    ADD r0, r0, r12
    LDRD r6, [r0]               @ r6 = buf1a[4] and r7 = buf1a[5]
    ADD r0, r0, r12
    LDRD r8, [r0]               @ r8 = buf1a[6] and r9 = buf1a[7]
    ADD r0, r0, r12
    LDRD r10, [r0]              @ r10 = buf1a[8] and r11 = buf1a[9]

    ADD r1, r4, r10             @ r1 = buf1a[2] + buf1a[8]
    SUB r4, r4, r10             @ r4 = buf1a[2] - buf1a[8]
    MOVW r10, #0xB000           @ r10 = C54 (top) : C55 (bottom)
    MOVT r10, #0x478E
    ADD r12, r6, r8             @ r3 = buf1a[4] + buf1a[6]
    SUB r8, r6, r8              @ r2 = buf1a[4] - buf1a[6]

    SUB r6, r1, r12             @ (r1 - r3)
    SMULWT r6, r6, r10          @ t = mult32x16in32_shl((r1 - r3), C54)
    ADD r1, r1, r12             @ r1 = r1 + r3
    ADD r2, r2, r1              @ temp1 = inp[0] + r1
    SMULWB r1, r1, r10          @ mult32_shl(r1, C55)
    ADD r1, r2, r1, LSL #2      @ r1 = temp1 + ((mult32_shl(r1, C55)) << 1)
    MOVW r10, #0x9D84           @ r10 = C51 (top) : C52 (bottom)
    MOVT r10, #0x79BC
    STR r2, [lr], #4            @ *buf2++ = temp1

    SUB r12, r1, r6, LSL #1     @ r3 = r1 - t
    ADD r1, r1, r6, LSL #1      @ r1 = r1 + t

    ADD r2, r4, r8              @ (r4 + r2)
    SMULWT r2, r2, r10          @ t = mult32_shl((r4 + r2), C51)
    @LSL r2, r2, #1
    MOV r2, r2, LSL #1

    SMULWB r4, r4, r10          @ mult32_shl(r4, C52)
    MOVW r10, #0xD180           @ r10 = C53 (bottom halfword)
    MOVT r10, #0xFFFF
    ADD r4, r2, r4, LSL #2      @ r4 = t + (mult32_shl(r4, C52) << 1)

    SMULWB r8, r8, r10          @ mult32_shl(r2, C53)
    ADD r2, r2, r8, LSL #1      @ r2 = t + mult32_shl(r2, C53)

    ADD r6, r5, r11             @ s1 = buf1a[3] + buf1a[9]
    SUB r8, r5, r11             @ s4 = buf1a[3] - buf1a[9]
    MOVW r10, #0xB000           @ r10 = C54 : C55 again
    MOVT r10, #0x478E
    ADD r5, r7, r9              @ s3 = buf1a[5] + buf1a[7]
    SUB r7, r7, r9              @ s2 = buf1a[5] - buf1a[7]

    SUB r9, r6, r5              @ (s1 - s3)
    SMULWT r9, r9, r10          @ t = mult32x16in32_shl((s1 - s3), C54)
    ADD r6, r6, r5              @ s1 = s1 + s3
    ADD r3, r3, r6              @ temp2 = buf1a[1] + s1
    SMULWB r6, r6, r10          @ mult32_shl(s1, C55)
    ADD r6, r3, r6, LSL #2      @ s1 = temp2 + ((mult32_shl(s1, C55)) << 1)
    MOVW r10, #0x9D84           @ r10 = C51 : C52 again
    MOVT r10, #0x79BC
    STR r3, [lr], #4            @ *buf2++ = temp2

    SUB r5, r6, r9, LSL #1      @ s3 = s1 - t
    ADD r6, r6, r9, LSL #1      @ s1 = s1 + t
    SUB r0, r0, #896            @ r0 -> inp[160]

    ADD r11, r7, r8             @ (s4 + s2)
    SMULWT r11, r11, r10        @ t = mult32_shl((s4 + s2), C51)
    @LSL r11, r11, #1
    MOV r11, r11, LSL #1

    SMULWB r8, r8, r10          @ mult32_shl(s4, C52)
    MOVW r10, #0xD180           @ r10 = C53 again
    MOVT r10, #0xFFFF
    ADD r8, r11, r8, LSL #2     @ s4 = t + (mult32_shl(s4, C52) << 1)

    SMULWB r7, r7, r10          @ mult32_shl(s2, C53)
    ADD r7, r11, r7, LSL #1     @ s2 = t + mult32_shl(s2, C53)

    ADD r3, r1, r7              @ buf2[2] = r1 + s2
    SUB r9, r6, r2              @ buf2[3] = s1 - r2
    SUB r10, r12, r8            @ buf2[4] = r3 - s4
    ADD r11, r5, r4             @ buf2[5] = s3 + r4
    ADD r12, r12, r8            @ buf2[6] = r3 + s4
    SUB r4, r5, r4              @ buf2[7] = s3 - r4
    SUB r5, r1, r7              @ buf2[8] = r1 - s2
    ADD r6, r6, r2              @ buf2[9] = s1 + r2
    STMIA lr!, {r3, r9-r12}     @ buf2[2]..buf2[6]

    MOV r12, #384               @ restore the input stride
    MOVW r1, #0xFA00            @ r1 = -1536, wrap offset for the next block
    MOVT r1, #0xFFFF

    STMIA lr!, {r4-r6}          @ buf2[7]..buf2[9]

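@ Second radix-5 butterfly.  The loads mirror the first block, except
@ that the fifth read wraps: r1 holds -1536 (minus four strides), so
@ buf1a[8]/buf1a[9] come from one stride below the block's first read
@ before r0 is moved forward again.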
    LDRD r2, [r0]               @ r2 = buf1a[0] and r3 = buf1a[1]
    ADD r0, r0, r12
    LDRD r4, [r0]               @ r4 = buf1a[2] and r5 = buf1a[3]
    ADD r0, r0, r12
    LDRD r6, [r0]               @ r6 = buf1a[4] and r7 = buf1a[5]
    ADD r0, r0, r12
    LDRD r8, [r0]               @ r8 = buf1a[6] and r9 = buf1a[7]
    ADD r0, r0, r1              @ step back 1536 bytes (wrap)
    LDRD r10, [r0]              @ r10 = buf1a[8] and r11 = buf1a[9]
    ADD r0, r0, #1024           @ r0 -> inp[320]

    ADD r1, r4, r10             @ r1 = buf1a[2] + buf1a[8]
    SUB r4, r4, r10             @ r4 = buf1a[2] - buf1a[8]
    MOVW r10, #0xB000           @ r10 = C54 (top) : C55 (bottom)
    MOVT r10, #0x478E
    ADD r12, r6, r8             @ r3 = buf1a[4] + buf1a[6]
    SUB r8, r6, r8              @ r2 = buf1a[4] - buf1a[6]

    SUB r6, r1, r12             @ (r1 - r3)
    SMULWT r6, r6, r10          @ t = mult32x16in32_shl((r1 - r3), C54)
    ADD r1, r1, r12             @ r1 = r1 + r3
    ADD r2, r2, r1              @ temp1 = inp[0] + r1
    SMULWB r1, r1, r10          @ mult32_shl(r1, C55)
    ADD r1, r2, r1, LSL #2      @ r1 = temp1 + ((mult32_shl(r1, C55)) << 1)
    MOVW r10, #0x9D84           @ r10 = C51 (top) : C52 (bottom)
    MOVT r10, #0x79BC
    STR r2, [lr], #4            @ *buf2++ = temp1

    SUB r12, r1, r6, LSL #1     @ r3 = r1 - t
    ADD r1, r1, r6, LSL #1      @ r1 = r1 + t

    ADD r2, r4, r8              @ (r4 + r2)
    SMULWT r2, r2, r10          @ t = mult32_shl((r4 + r2), C51)
    @LSL r2, r2, #1
    MOV r2, r2, LSL #1

    SMULWB r4, r4, r10          @ mult32_shl(r4, C52)
    MOVW r10, #0xD180           @ r10 = C53 (bottom halfword)
    MOVT r10, #0xFFFF
    ADD r4, r2, r4, LSL #2      @ r4 = t + (mult32_shl(r4, C52) << 1)

    SMULWB r8, r8, r10          @ mult32_shl(r2, C53)
    ADD r2, r2, r8, LSL #1      @ r2 = t + mult32_shl(r2, C53)

    ADD r6, r5, r11             @ s1 = buf1a[3] + buf1a[9]
    SUB r8, r5, r11             @ s4 = buf1a[3] - buf1a[9]
    MOVW r10, #0xB000           @ r10 = C54 : C55 again
    MOVT r10, #0x478E
    ADD r5, r7, r9              @ s3 = buf1a[5] + buf1a[7]
    SUB r7, r7, r9              @ s2 = buf1a[5] - buf1a[7]

    SUB r9, r6, r5              @ (s1 - s3)
    SMULWT r9, r9, r10          @ t = mult32x16in32_shl((s1 - s3), C54)
    ADD r6, r6, r5              @ s1 = s1 + s3
    ADD r3, r3, r6              @ temp2 = buf1a[1] + s1
    SMULWB r6, r6, r10          @ mult32_shl(s1, C55)
    ADD r6, r3, r6, LSL #2      @ s1 = temp2 + ((mult32_shl(s1, C55)) << 1)
    MOVW r10, #0x9D84           @ r10 = C51 : C52 again
    MOVT r10, #0x79BC
    STR r3, [lr], #4            @ *buf2++ = temp2

    SUB r5, r6, r9, LSL #1      @ s3 = s1 - t
    ADD r6, r6, r9, LSL #1      @ s1 = s1 + t

    ADD r11, r7, r8             @ (s4 + s2)
    SMULWT r11, r11, r10        @ t = mult32_shl((s4 + s2), C51)
    @LSL r11, r11, #1
    MOV r11, r11, LSL #1

    SMULWB r8, r8, r10          @ mult32_shl(s4, C52)
    MOVW r10, #0xD180           @ r10 = C53 again
    MOVT r10, #0xFFFF
    ADD r8, r11, r8, LSL #2     @ s4 = t + (mult32_shl(s4, C52) << 1)

    SMULWB r7, r7, r10          @ mult32_shl(s2, C53)
    ADD r7, r11, r7, LSL #1     @ s2 = t + mult32_shl(s2, C53)

    ADD r3, r1, r7              @ buf2[2] = r1 + s2
    SUB r9, r6, r2              @ buf2[3] = s1 - r2
    SUB r10, r12, r8            @ buf2[4] = r3 - s4
    ADD r11, r5, r4             @ buf2[5] = s3 + r4
    ADD r12, r12, r8            @ buf2[6] = r3 + s4
    SUB r4, r5, r4              @ buf2[7] = s3 - r4
    SUB r5, r1, r7              @ buf2[8] = r1 - s2
    ADD r6, r6, r2              @ buf2[9] = s1 + r2
    MOVW r1, #0xFA00            @ r1 = -1536, wrap offset for the next block
    MOVT r1, #0xFFFF

    STMIA lr!, {r3, r9-r12}     @ buf2[2]..buf2[6]
    MOV r12, #384               @ restore the input stride
    STMIA lr!, {r4-r6}          @ buf2[7]..buf2[9]

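@ Third radix-5 butterfly.  This time the wrap via r1 (-1536) happens
@ after the second read, and the remaining loads continue with the
@ usual +384 stride.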
    LDRD r2, [r0]               @ r2 = buf1a[0] and r3 = buf1a[1]
    ADD r0, r0, r12
    LDRD r4, [r0]               @ r4 = buf1a[2] and r5 = buf1a[3]
    ADD r0, r0, r1              @ step back 1536 bytes (wrap)

    LDRD r6, [r0]               @ r6 = buf1a[4] and r7 = buf1a[5]
    ADD r0, r0, r12
    LDRD r8, [r0]               @ r8 = buf1a[6] and r9 = buf1a[7]
    ADD r0, r0, r12
    LDRD r10, [r0]              @ r10 = buf1a[8] and r11 = buf1a[9]
    ADD r0, r0, r12

    ADD r1, r4, r10             @ r1 = buf1a[2] + buf1a[8]
    SUB r4, r4, r10             @ r4 = buf1a[2] - buf1a[8]
    MOVW r10, #0xB000           @ r10 = C54 (top) : C55 (bottom)
    MOVT r10, #0x478E
    ADD r12, r6, r8             @ r3 = buf1a[4] + buf1a[6]
    SUB r8, r6, r8              @ r2 = buf1a[4] - buf1a[6]

    SUB r6, r1, r12             @ (r1 - r3)
    SMULWT r6, r6, r10          @ t = mult32x16in32_shl((r1 - r3), C54)
    ADD r1, r1, r12             @ r1 = r1 + r3
    ADD r2, r2, r1              @ temp1 = inp[0] + r1
    SMULWB r1, r1, r10          @ mult32_shl(r1, C55)
    ADD r1, r2, r1, LSL #2      @ r1 = temp1 + ((mult32_shl(r1, C55)) << 1)
    MOVW r10, #0x9D84           @ r10 = C51 (top) : C52 (bottom)
    MOVT r10, #0x79BC
    STR r2, [lr], #4            @ *buf2++ = temp1

    SUB r12, r1, r6, LSL #1     @ r3 = r1 - t
    ADD r1, r1, r6, LSL #1      @ r1 = r1 + t

    ADD r2, r4, r8              @ (r4 + r2)
    SMULWT r2, r2, r10          @ t = mult32_shl((r4 + r2), C51)
    @LSL r2, r2, #1
    MOV r2, r2, LSL #1

    SMULWB r4, r4, r10          @ mult32_shl(r4, C52)
    MOVW r10, #0xD180           @ r10 = C53 (bottom halfword)
    MOVT r10, #0xFFFF
    ADD r4, r2, r4, LSL #2      @ r4 = t + (mult32_shl(r4, C52) << 1)

    SMULWB r8, r8, r10          @ mult32_shl(r2, C53)
    ADD r2, r2, r8, LSL #1      @ r2 = t + mult32_shl(r2, C53)

    ADD r6, r5, r11             @ s1 = buf1a[3] + buf1a[9]
    SUB r8, r5, r11             @ s4 = buf1a[3] - buf1a[9]
    MOVW r10, #0xB000           @ r10 = C54 : C55 again
    MOVT r10, #0x478E
    ADD r5, r7, r9              @ s3 = buf1a[5] + buf1a[7]
    SUB r7, r7, r9              @ s2 = buf1a[5] - buf1a[7]

    SUB r9, r6, r5              @ (s1 - s3)
    SMULWT r9, r9, r10          @ t = mult32x16in32_shl((s1 - s3), C54)
    ADD r6, r6, r5              @ s1 = s1 + s3
    ADD r3, r3, r6              @ temp2 = buf1a[1] + s1
    SMULWB r6, r6, r10          @ mult32_shl(s1, C55)
    ADD r6, r3, r6, LSL #2      @ s1 = temp2 + ((mult32_shl(s1, C55)) << 1)
    MOVW r10, #0x9D84           @ r10 = C51 : C52 again
    MOVT r10, #0x79BC
    STR r3, [lr], #4            @ *buf2++ = temp2

    SUB r5, r6, r9, LSL #1      @ s3 = s1 - t
    ADD r6, r6, r9, LSL #1      @ s1 = s1 + t

    ADD r11, r7, r8             @ (s4 + s2)
    SMULWT r11, r11, r10        @ t = mult32_shl((s4 + s2), C51)
    @LSL r11, r11, #1
    MOV r11, r11, LSL #1

    SMULWB r8, r8, r10          @ mult32_shl(s4, C52)
    MOVW r10, #0xD180           @ r10 = C53 again
    MOVT r10, #0xFFFF
    ADD r8, r11, r8, LSL #2     @ s4 = t + (mult32_shl(s4, C52) << 1)

    SMULWB r7, r7, r10          @ mult32_shl(s2, C53)
    ADD r7, r11, r7, LSL #1     @ s2 = t + mult32_shl(s2, C53)

    ADD r3, r1, r7              @ buf2[2] = r1 + s2
    SUB r9, r6, r2              @ buf2[3] = s1 - r2
    SUB r10, r12, r8            @ buf2[4] = r3 - s4
    ADD r11, r5, r4             @ buf2[5] = s3 + r4
    ADD r12, r12, r8            @ buf2[6] = r3 + s4
    SUB r4, r5, r4              @ buf2[7] = s3 - r4
    SUB r5, r1, r7              @ buf2[8] = r1 - s2
    ADD r6, r6, r2              @ buf2[9] = s1 + r2

    STMIA lr!, {r3, r9-r12}     @ buf2[2]..buf2[6]
    STMIA lr!, {r4-r6}          @ buf2[7]..buf2[9]

    SUB lr, lr, #120            @ rewind lr to the start of fft3out (3 x 10 words)
    MOVW r12, #28378            @ r12 = sinmu, sin(60 deg) ~= 0.866 in Q15
    LDMFD r13!, {r10, r11}      @ pop: r10 = re-ordering table (r3), r11 = output (r1)

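@ ---------------------------------------------------------------------
@ Stage 2: five radix-3 butterflies over fft3out, reading elements 0,
@ 10 and 20 complex words apart and scattering each result triple to
@ the output through the byte re-ordering table in r10 (each index
@ byte selects one 8-byte complex pair relative to the output base in
@ r11).  A sketch of the kernel, reconstructed from the inline
@ comments, so an approximation rather than the verified reference
@ source:
@
@   X01r  = buf1[0] + buf1[2];     X01i  = buf1[1] + buf1[3];
@   add_r = buf1[2] + buf1[4];     add_i = buf1[3] + buf1[5];
@   sub_r = buf1[2] - buf1[4];     sub_i = buf1[3] - buf1[5];
@   p1 = add_r >> 1;               p4 = add_i >> 1;
@   p2 = mult32x16in32_shl(sub_i, sinmu);
@   p3 = mult32x16in32_shl(sub_r, sinmu);
@   temp  = buf1[0] - p1;
@   temp1 = buf1[1] + p3;          temp2 = buf1[1] - p3;
@   out[idx0] = { X01r + buf1[4], X01i + buf1[5] };
@   out[idx1] = { temp + p2,      temp2 - p4 };
@   out[idx2] = { temp - p2,      temp1 - p4 };
@ ---------------------------------------------------------------------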
LOOP_FFT3:
    LDRD r0, [lr]               @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
    LDRD r2, [lr, #40]          @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
    LDRD r4, [lr, #80]          @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
    ADD lr, lr, #8

    ADD r6, r0, r2              @ X01r = add32(buf1[0], buf1[2])
    ADD r7, r1, r3              @ X01i = add32(buf1[1], buf1[3])

    ADD r8, r2, r4              @ add_r = add32(buf1[2], buf1[4])
    ADD r9, r3, r5              @ add_i = add32(buf1[3], buf1[5])

    SUB r2, r2, r4              @ sub_r = sub32(buf1[2], buf1[4])
    SUB r3, r3, r5              @ sub_i = sub32(buf1[3], buf1[5])

    @ASR r8, r8, #1             @ p1 = add_r >> 1
    MOV r8, r8, ASR #1

    @ASR r9, r9, #1             @ p4 = add_i >> 1
    MOV r9, r9, ASR #1

    SMULWB r3, r3, r12          @ p2 = mult32x16in32_shl(sub_i, sinmu)
    SMULWB r2, r2, r12          @ p3 = mult32x16in32_shl(sub_r, sinmu)

    SUB r0, r0, r8              @ temp = sub32(buf1a[0], p1)
    ADD r8, r1, r2, LSL #1      @ temp1 = add32(buf1a[1], p3)
    SUB r2, r1, r2, LSL #1      @ temp2 = sub32(buf1a[1], p3)

    ADD r4, r6, r4              @ add32(X01r, buf1a[4])
    ADD r5, r7, r5              @ add32(X01i, buf1a[5])
    ADD r6, r0, r3, LSL #1      @ add32(temp, p2)
    SUB r7, r2, r9              @ sub32(temp2, p4)
    SUB r9, r8, r9              @ sub32(temp1, p4)
    SUB r8, r0, r3, LSL #1      @ sub32(temp, p2)

    MOV r3, r11
    LDRB r0, [r10], #1          @ next three output indices from the table
    LDRB r1, [r10], #1
    LDRB r2, [r10], #1
    ADD r0, r11, r0, LSL #3     @ base + index * 8 bytes (one complex pair)
    ADD r1, r11, r1, LSL #3
    ADD r2, r11, r2, LSL #3
    STRD r4, [r0]               @ scatter the three complex results
    STRD r6, [r1]
    STRD r8, [r2]

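@ The same radix-3 butterfly and table-driven scatter repeat for the
@ remaining four input triples.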
    LDRD r0, [lr]               @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
    LDRD r2, [lr, #40]          @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
    LDRD r4, [lr, #80]          @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
    ADD lr, lr, #8

    ADD r6, r0, r2              @ X01r = add32(buf1[0], buf1[2])
    ADD r7, r1, r3              @ X01i = add32(buf1[1], buf1[3])

    ADD r8, r2, r4              @ add_r = add32(buf1[2], buf1[4])
    ADD r9, r3, r5              @ add_i = add32(buf1[3], buf1[5])

    SUB r2, r2, r4              @ sub_r = sub32(buf1[2], buf1[4])
    SUB r3, r3, r5              @ sub_i = sub32(buf1[3], buf1[5])

    @ASR r8, r8, #1             @ p1 = add_r >> 1
    MOV r8, r8, ASR #1

    @ASR r9, r9, #1             @ p4 = add_i >> 1
    MOV r9, r9, ASR #1

    SMULWB r3, r3, r12          @ p2 = mult32x16in32_shl(sub_i, sinmu)
    SMULWB r2, r2, r12          @ p3 = mult32x16in32_shl(sub_r, sinmu)

    SUB r0, r0, r8              @ temp = sub32(buf1a[0], p1)
    ADD r8, r1, r2, LSL #1      @ temp1 = add32(buf1a[1], p3)
    SUB r2, r1, r2, LSL #1      @ temp2 = sub32(buf1a[1], p3)

    ADD r4, r6, r4              @ add32(X01r, buf1a[4])
    ADD r5, r7, r5              @ add32(X01i, buf1a[5])
    ADD r6, r0, r3, LSL #1      @ add32(temp, p2)
    SUB r7, r2, r9              @ sub32(temp2, p4)
    SUB r9, r8, r9              @ sub32(temp1, p4)
    SUB r8, r0, r3, LSL #1      @ sub32(temp, p2)

    LDRB r0, [r10], #1
    LDRB r1, [r10], #1
    LDRB r2, [r10], #1
    ADD r0, r11, r0, LSL #3
    ADD r1, r11, r1, LSL #3
    ADD r2, r11, r2, LSL #3
    STRD r4, [r0]
    STRD r6, [r1]
    STRD r8, [r2]

    LDRD r0, [lr]               @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
    LDRD r2, [lr, #40]          @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
    LDRD r4, [lr, #80]          @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
    ADD lr, lr, #8

    ADD r6, r0, r2              @ X01r = add32(buf1[0], buf1[2])
    ADD r7, r1, r3              @ X01i = add32(buf1[1], buf1[3])

    ADD r8, r2, r4              @ add_r = add32(buf1[2], buf1[4])
    ADD r9, r3, r5              @ add_i = add32(buf1[3], buf1[5])

    SUB r2, r2, r4              @ sub_r = sub32(buf1[2], buf1[4])
    SUB r3, r3, r5              @ sub_i = sub32(buf1[3], buf1[5])

    @ASR r8, r8, #1             @ p1 = add_r >> 1
    MOV r8, r8, ASR #1

    @ASR r9, r9, #1             @ p4 = add_i >> 1
    MOV r9, r9, ASR #1

    SMULWB r3, r3, r12          @ p2 = mult32x16in32_shl(sub_i, sinmu)
    SMULWB r2, r2, r12          @ p3 = mult32x16in32_shl(sub_r, sinmu)

    SUB r0, r0, r8              @ temp = sub32(buf1a[0], p1)
    ADD r8, r1, r2, LSL #1      @ temp1 = add32(buf1a[1], p3)
    SUB r2, r1, r2, LSL #1      @ temp2 = sub32(buf1a[1], p3)

    ADD r4, r6, r4              @ add32(X01r, buf1a[4])
    ADD r5, r7, r5              @ add32(X01i, buf1a[5])
    ADD r6, r0, r3, LSL #1      @ add32(temp, p2)
    SUB r7, r2, r9              @ sub32(temp2, p4)
    SUB r9, r8, r9              @ sub32(temp1, p4)
    SUB r8, r0, r3, LSL #1      @ sub32(temp, p2)

    LDRB r0, [r10], #1
    LDRB r1, [r10], #1
    LDRB r2, [r10], #1
    ADD r0, r11, r0, LSL #3
    ADD r1, r11, r1, LSL #3
    ADD r2, r11, r2, LSL #3
    STRD r4, [r0]
    STRD r6, [r1]
    STRD r8, [r2]

    LDRD r0, [lr]               @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
    LDRD r2, [lr, #40]          @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
    LDRD r4, [lr, #80]          @ r4 = fft3outptr[20] and r5 = fft3outptr[21]
    ADD lr, lr, #8

    ADD r6, r0, r2              @ X01r = add32(buf1[0], buf1[2])
    ADD r7, r1, r3              @ X01i = add32(buf1[1], buf1[3])

    ADD r8, r2, r4              @ add_r = add32(buf1[2], buf1[4])
    ADD r9, r3, r5              @ add_i = add32(buf1[3], buf1[5])

    SUB r2, r2, r4              @ sub_r = sub32(buf1[2], buf1[4])
    SUB r3, r3, r5              @ sub_i = sub32(buf1[3], buf1[5])

    @ASR r8, r8, #1             @ p1 = add_r >> 1
    MOV r8, r8, ASR #1

    @ASR r9, r9, #1             @ p4 = add_i >> 1
    MOV r9, r9, ASR #1

    SMULWB r3, r3, r12          @ p2 = mult32x16in32_shl(sub_i, sinmu)
    SMULWB r2, r2, r12          @ p3 = mult32x16in32_shl(sub_r, sinmu)

    SUB r0, r0, r8              @ temp = sub32(buf1a[0], p1)
    ADD r8, r1, r2, LSL #1      @ temp1 = add32(buf1a[1], p3)
    SUB r2, r1, r2, LSL #1      @ temp2 = sub32(buf1a[1], p3)

    ADD r4, r6, r4              @ add32(X01r, buf1a[4])
    ADD r5, r7, r5              @ add32(X01i, buf1a[5])
    ADD r6, r0, r3, LSL #1      @ add32(temp, p2)
    SUB r7, r2, r9              @ sub32(temp2, p4)
    SUB r9, r8, r9              @ sub32(temp1, p4)
    SUB r8, r0, r3, LSL #1      @ sub32(temp, p2)

    LDRB r0, [r10], #1
    LDRB r1, [r10], #1
    LDRB r2, [r10], #1
    ADD r0, r11, r0, LSL #3
    ADD r1, r11, r1, LSL #3
    ADD r2, r11, r2, LSL #3
    STRD r4, [r0]
    STRD r6, [r1]
    STRD r8, [r2]

    LDRD r0, [lr]               @ r0 = fft3outptr[0] and r1 = fft3outptr[1]
    LDRD r2, [lr, #40]          @ r2 = fft3outptr[10] and r3 = fft3outptr[11]
    LDRD r4, [lr, #80]          @ r4 = fft3outptr[20] and r5 = fft3outptr[21]

    ADD r6, r0, r2              @ X01r = add32(buf1[0], buf1[2])
    ADD r7, r1, r3              @ X01i = add32(buf1[1], buf1[3])

    ADD r8, r2, r4              @ add_r = add32(buf1[2], buf1[4])
    ADD r9, r3, r5              @ add_i = add32(buf1[3], buf1[5])

    SUB r2, r2, r4              @ sub_r = sub32(buf1[2], buf1[4])
    SUB r3, r3, r5              @ sub_i = sub32(buf1[3], buf1[5])

    @ASR r8, r8, #1             @ p1 = add_r >> 1
    MOV r8, r8, ASR #1

    @ASR r9, r9, #1             @ p4 = add_i >> 1
    MOV r9, r9, ASR #1

    SMULWB r3, r3, r12          @ p2 = mult32x16in32_shl(sub_i, sinmu)
    SMULWB r2, r2, r12          @ p3 = mult32x16in32_shl(sub_r, sinmu)

    SUB r0, r0, r8              @ temp = sub32(buf1a[0], p1)
    ADD r8, r1, r2, LSL #1      @ temp1 = add32(buf1a[1], p3)
    SUB r2, r1, r2, LSL #1      @ temp2 = sub32(buf1a[1], p3)

    ADD r4, r6, r4              @ add32(X01r, buf1a[4])
    ADD r5, r7, r5              @ add32(X01i, buf1a[5])
    ADD r6, r0, r3, LSL #1      @ add32(temp, p2)
    SUB r7, r2, r9              @ sub32(temp2, p4)
    SUB r9, r8, r9              @ sub32(temp1, p4)
    SUB r8, r0, r3, LSL #1      @ sub32(temp, p2)

    LDRB r0, [r10], #1
    LDRB r1, [r10], #1
    LDRB r2, [r10], #1
    ADD r0, r11, r0, LSL #3
    ADD r1, r11, r1, LSL #3
    ADD r2, r11, r2, LSL #3
    STRD r4, [r0]
    STRD r6, [r1]
    STRD r8, [r2]

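@ Restore r4-r12 and return by popping the saved r14 directly into
@ r15.  The two words pushed in the prologue were already popped into
@ r10/r11 before LOOP_FFT3, so the stack is balanced here.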
    LDMFD r13!, {r4 - r12, r15}