You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
4.3 KiB
159 lines
4.3 KiB
4 months ago
|
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
|
||
|
/**
*******************************************************************************
* @file
*  icv_variance_sse42.c
*
* @brief
*  This file contains the functions to compute variance
*
* @author
*  Ittiam
*
* @par List of Functions:
*  icv_variance_8x4_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
|
||
|
/*****************************************************************************/
|
||
|
/* File Includes */
|
||
|
/*****************************************************************************/
|
||
|
/* System include files */
|
||
|
#include <stdio.h>
|
||
|
#include <stdint.h>
|
||
|
#include <string.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <assert.h>
|
||
|
#include <immintrin.h>
|
||
|
|
||
|
/* User include files */
|
||
|
#include "icv_datatypes.h"
|
||
|
#include "icv_macros.h"
|
||
|
#include "icv_platform_macros.h"
|
||
|
#include "icv.h"
|
||
|
|
||
|
/**
|
||
|
*******************************************************************************
|
||
|
*
|
||
|
* @brief
|
||
|
* Computes variance of a given 8x4 block
|
||
|
*
|
||
|
* @par Description
|
||
|
* Compute variance of a given 8x4 block
|
||
|
*
|
||
|
* @param[in] pu1_src
|
||
|
* Source
|
||
|
*
|
||
|
* @param[in] src_strd
|
||
|
* Source stride
|
||
|
*
|
||
|
* @param[in] wd
|
||
|
* Assumed to be 8
|
||
|
*
|
||
|
* @param[in] ht
|
||
|
* Assumed to be 4
|
||
|
*
|
||
|
* @returns
|
||
|
* Variance
|
||
|
*
|
||
|
* @remarks
|
||
|
*
|
||
|
*******************************************************************************
|
||
|
*/
|
||
|
WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht)
|
||
|
{
|
||
|
WORD32 sum;
|
||
|
WORD32 sum_sqr;
|
||
|
WORD32 blk_sz;
|
||
|
WORD32 vrnc;
|
||
|
__m128 src_r0, src_r1;
|
||
|
__m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3;
|
||
|
__m128i sum_r0, sum_r1;
|
||
|
__m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3;
|
||
|
__m128i vsum, vsum_sqr;
|
||
|
__m128i zero;
|
||
|
UNUSED(wd);
|
||
|
UNUSED(ht);
|
||
|
|
||
|
ASSERT(wd == 8);
|
||
|
ASSERT(ht == 4);
|
||
|
|
||
|
sum = 0;
|
||
|
sum_sqr = 0;
|
||
|
|
||
|
blk_sz = 8 * 4;
|
||
|
|
||
|
zero = _mm_setzero_si128();
|
||
|
|
||
|
/* Load source */
|
||
|
src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
|
||
|
pu1_src += src_strd;
|
||
|
|
||
|
src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
|
||
|
pu1_src += src_strd;
|
||
|
|
||
|
src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
|
||
|
pu1_src += src_strd;
|
||
|
|
||
|
src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
|
||
|
pu1_src += src_strd;
|
||
|
|
||
|
/* Compute sum of all elements */
|
||
|
/* Use SAD with 0, since there is no pairwise addition */
|
||
|
sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero);
|
||
|
sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero);
|
||
|
|
||
|
/* Accumulate SAD */
|
||
|
vsum = _mm_add_epi64(sum_r0, sum_r1);
|
||
|
vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));
|
||
|
|
||
|
sum = _mm_cvtsi128_si32(vsum);
|
||
|
|
||
|
/* Unpack to 16 bits */
|
||
|
ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero);
|
||
|
ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero);
|
||
|
ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero);
|
||
|
ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero);
|
||
|
|
||
|
/* Compute sum of squares */
|
||
|
sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0);
|
||
|
sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1);
|
||
|
sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2);
|
||
|
sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3);
|
||
|
|
||
|
vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1);
|
||
|
vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2);
|
||
|
vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3);
|
||
|
|
||
|
vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8));
|
||
|
vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4));
|
||
|
sum_sqr = _mm_cvtsi128_si32(vsum_sqr);
|
||
|
|
||
|
/* Compute variance */
|
||
|
vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz);
|
||
|
|
||
|
return vrnc;
|
||
|
}
|
||
|
|