You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

663 lines
16 KiB

/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "igt_core.h"
#include "igt_stats.h"
/* All-ones 64-bit value; used as the initial running minimum. */
#define U64_MAX ((uint64_t)~0ULL)

/*
 * Read element @i of the sorted (resp. insertion-ordered) sample array of
 * @stats, picking the double or the uint64_t view according to
 * stats->is_float. Because both arms of the conditional take part in the
 * usual arithmetic conversions, the result is always a double.
 *
 * Both arguments are parenthesized so that expressions such as `&s` or
 * `a + b` can be passed safely. Note that @stats is evaluated more than
 * once: do not pass an expression with side effects.
 */
#define sorted_value(stats, i) \
	((stats)->is_float ? (stats)->sorted_f[(i)] : (stats)->sorted_u64[(i)])
#define unsorted_value(stats, i) \
	((stats)->is_float ? (stats)->values_f[(i)] : (stats)->values_u64[(i)])
/**
* SECTION:igt_stats
* @short_description: Tools for statistical analysis
* @title: Stats
* @include: igt.h
*
* Various tools to make sense of data.
*
* #igt_stats_t is a container of data samples. igt_stats_push() is used to add
* new samples and various results (mean, variance, standard deviation, ...)
* can then be retrieved.
*
* |[
* igt_stats_t stats;
*
* igt_stats_init_with_size(&stats, 8);
*
* igt_stats_push(&stats, 2);
* igt_stats_push(&stats, 4);
* igt_stats_push(&stats, 4);
* igt_stats_push(&stats, 4);
* igt_stats_push(&stats, 5);
* igt_stats_push(&stats, 5);
* igt_stats_push(&stats, 7);
* igt_stats_push(&stats, 9);
*
* printf("Mean: %lf\n", igt_stats_get_mean(&stats));
*
* igt_stats_fini(&stats);
* ]|
*/
/*
 * Growth policy for the sample arrays, modelled on Python's list: the
 * requested size plus 1/64th of it plus a small constant (3 slots for
 * requests below 9, 6 otherwise).
 */
static unsigned int get_new_capacity(int need)
{
	const int slack = need < 9 ? 3 : 6;
	const unsigned int headroom = (need >> 6) + slack;

	/* the final addition is carried out in unsigned arithmetic */
	return headroom + need;
}
/*
 * Makes sure @stats has room for @n_additional_values more samples on top of
 * the ones it already holds, growing the value array when needed.
 *
 * Growing the value array drops the sorted array and marks it invalid; it is
 * re-created on demand by igt_stats_ensure_sorted_values(). Allocation
 * failure is reported through igt_assert().
 */
static void igt_stats_ensure_capacity(igt_stats_t *stats,
				      unsigned int n_additional_values)
{
	unsigned int new_n_values = stats->n_values + n_additional_values;
	unsigned int new_capacity;

	/*
	 * A wrapped sum would look small enough to fit and let callers write
	 * past the end of the array; refuse it outright.
	 */
	igt_assert(new_n_values >= stats->n_values);

	if (new_n_values <= stats->capacity)
		return;

	new_capacity = get_new_capacity(new_n_values);
	stats->values_u64 = realloc(stats->values_u64,
				    sizeof(*stats->values_u64) * new_capacity);
	igt_assert(stats->values_u64);

	stats->capacity = new_capacity;

	/* The sorted copy was sized for the old capacity: discard it. */
	free(stats->sorted_u64);
	stats->sorted_u64 = NULL;
	/* Never leave the "valid" flag set while the array is gone. */
	stats->sorted_array_valid = false;
}
/**
 * igt_stats_init:
 * @stats: An #igt_stats_t instance
 *
 * Initializes an #igt_stats_t instance with room for at least 128 samples.
 * igt_stats_fini() must be called once finished with @stats.
 *
 * The integer min/max and the floating point range are all seeded with
 * sentinels so that the first pushed value of either kind becomes both the
 * minimum and the maximum, exactly as igt_stats_init_with_size() does.
 */
void igt_stats_init(igt_stats_t *stats)
{
	memset(stats, 0, sizeof(*stats));

	igt_stats_ensure_capacity(stats, 128);

	stats->min = U64_MAX;
	stats->max = 0;
	/*
	 * Without these the range would start at [0, 0] after the memset and
	 * positive-only (or negative-only) float data would never move one
	 * of the two bounds.
	 */
	stats->range[0] = HUGE_VAL;
	stats->range[1] = -HUGE_VAL;
}
/**
 * igt_stats_init_with_size:
 * @stats: An #igt_stats_t instance
 * @capacity: Number of data samples @stats can contain
 *
 * Same as igt_stats_init(), except that the caller chooses how many samples
 * to reserve room for up front. This avoids reallocating the underlying
 * array(s) while pushing when the number of data points is known in advance.
 *
 * igt_stats_fini() must be called once finished with @stats.
 */
void igt_stats_init_with_size(igt_stats_t *stats, unsigned int capacity)
{
	memset(stats, 0, sizeof(*stats));

	/* Sentinels: any pushed value replaces them. */
	stats->max = 0;
	stats->min = U64_MAX;
	stats->range[1] = -HUGE_VAL;
	stats->range[0] = HUGE_VAL;

	igt_stats_ensure_capacity(stats, capacity);
}
/**
 * igt_stats_fini:
 * @stats: An #igt_stats_t instance
 *
 * Releases the sample array and the sorted array owned by @stats. The
 * #igt_stats_t structure itself is not freed.
 */
void igt_stats_fini(igt_stats_t *stats)
{
	/* free(NULL) is a no-op, so a never-sorted set needs no special case */
	free(stats->sorted_u64);
	free(stats->values_u64);
}
/**
 * igt_stats_is_population:
 * @stats: An #igt_stats_t instance
 *
 * Returns: #true if @stats is treated as a full population, #false if it is
 * treated as a sample of a larger one.
 *
 * See igt_stats_set_population() for more details.
 */
bool igt_stats_is_population(igt_stats_t *stats)
{
	const bool population = stats->is_population;

	return population;
}
/**
 * igt_stats_set_population:
 * @stats: An #igt_stats_t instance
 * @full_population: Whether we're dealing with sample data or a full
 *		     population
 *
 * In statistics one usually works on a subset of the full data (which may be
 * a continuous or infinite set); the analysis is then carried out on a
 * sample of that population.
 *
 * The distinction matters because working from a sample leads to
 * [biased estimators](https://en.wikipedia.org/wiki/Bias_of_an_estimator).
 * The information given through this function is currently used to apply
 * [Bessel's correction](https://en.wikipedia.org/wiki/Bessel%27s_correction)
 * to the variance.
 *
 * Note that even though multiplying a sample variance by Bessel's
 * correction, n/(n - 1), yields an unbiased variance, the standard deviation
 * derived from it is not itself unbiased. Statisticians talk about a
 * "corrected" standard deviation.
 *
 * Passing #true makes the data set in @stats a full population; otherwise it
 * is considered a sample of a bigger population.
 *
 * A newly created @stats holds sample data by default.
 */
void igt_stats_set_population(igt_stats_t *stats, bool full_population)
{
	const bool changed = full_population != stats->is_population;

	if (changed) {
		stats->is_population = full_population;
		/* the cached variance depends on this flag: recompute it */
		stats->mean_variance_valid = false;
	}
}
/**
 * igt_stats_push:
 * @stats: An #igt_stats_t instance
 * @value: An integer value
 *
 * Appends @value to the @stats dataset. If @stats has already been switched
 * to floating point, the value is forwarded to igt_stats_push_float()
 * instead and the integer min/max are left untouched.
 */
void igt_stats_push(igt_stats_t *stats, uint64_t value)
{
	if (!stats->is_float) {
		igt_stats_ensure_capacity(stats, 1);

		stats->values_u64[stats->n_values] = value;
		stats->n_values++;

		/* cached results no longer describe the data */
		stats->sorted_array_valid = false;
		stats->mean_variance_valid = false;

		if (stats->min > value)
			stats->min = value;
		if (stats->max < value)
			stats->max = value;
	} else {
		igt_stats_push_float(stats, value);
	}
}
/**
 * igt_stats_push_float:
 * @stats: An #igt_stats_t instance
 * @value: A floating point value
 *
 * Adds a new value to the @stats dataset and, if it was still an integer
 * collection, converts it to a floating point one first. The samples already
 * stored are converted in place and folded into the floating point range so
 * that the range keeps describing the whole data set.
 */
void igt_stats_push_float(igt_stats_t *stats, double value)
{
	igt_stats_ensure_capacity(stats, 1);

	if (!stats->is_float) {
		unsigned int n;

		/*
		 * NOTE(review): values_u64 and values_f are expected to be
		 * two views of the same buffer (only values_u64 is ever
		 * allocated in this file) - confirm against igt_stats.h.
		 * Each slot is read as an integer before being overwritten
		 * with its floating point equivalent.
		 */
		for (n = 0; n < stats->n_values; n++) {
			double converted = stats->values_u64[n];

			stats->values_f[n] = converted;

			if (converted < stats->range[0])
				stats->range[0] = converted;
			if (converted > stats->range[1])
				stats->range[1] = converted;
		}

		stats->is_float = true;
	}

	stats->values_f[stats->n_values++] = value;

	/* cached results no longer describe the data */
	stats->mean_variance_valid = false;
	stats->sorted_array_valid = false;

	if (value < stats->range[0])
		stats->range[0] = value;
	if (value > stats->range[1])
		stats->range[1] = value;
}
/**
 * igt_stats_push_array:
 * @stats: An #igt_stats_t instance
 * @values: (array length=n_values): A pointer to an array of data points
 * @n_values: The number of data points to add
 *
 * Appends every element of @values, in order, to the @stats dataset. Room
 * for the whole batch is reserved once before the values are pushed.
 */
void igt_stats_push_array(igt_stats_t *stats,
			  const uint64_t *values, unsigned int n_values)
{
	const uint64_t *next = values;
	unsigned int left;

	igt_stats_ensure_capacity(stats, n_values);

	for (left = n_values; left > 0; left--)
		igt_stats_push(stats, *next++);
}
/**
 * igt_stats_get_min:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the smallest integer value pushed to @stats. Only valid for
 * integer data sets: asserts if @stats holds floating point values.
 */
uint64_t igt_stats_get_min(igt_stats_t *stats)
{
	uint64_t smallest;

	igt_assert(!stats->is_float);

	smallest = stats->min;
	return smallest;
}
/**
 * igt_stats_get_max:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the largest integer value pushed to @stats. Only valid for
 * integer data sets: asserts if @stats holds floating point values.
 */
uint64_t igt_stats_get_max(igt_stats_t *stats)
{
	uint64_t largest;

	igt_assert(!stats->is_float);

	largest = stats->max;
	return largest;
}
/**
 * igt_stats_get_range:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the range of the values in @stats, i.e. the difference between
 * the highest and the lowest value.
 *
 * The range can be a deceiving characterization of the values, because the
 * extreme minimum and maximum may be mere anomalies. Prefer the
 * interquartile range (see igt_stats_get_iqr()) or a histogram.
 */
uint64_t igt_stats_get_range(igt_stats_t *stats)
{
	const uint64_t highest = igt_stats_get_max(stats);
	const uint64_t lowest = igt_stats_get_min(stats);

	return highest - lowest;
}
/*
 * qsort() comparator ordering uint64_t values ascending.
 * Returns -1, 0 or 1.
 */
static int cmp_u64(const void *pa, const void *pb)
{
	const uint64_t lhs = *(const uint64_t *)pa;
	const uint64_t rhs = *(const uint64_t *)pb;

	/* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
	return (lhs > rhs) - (lhs < rhs);
}
/*
 * qsort() comparator ordering double values ascending.
 * Returns -1, 0 or 1; operands that compare neither smaller nor greater
 * (equal values, or a NaN on either side) yield 0.
 */
static int cmp_f(const void *pa, const void *pb)
{
	const double lhs = *(const double *)pa;
	const double rhs = *(const double *)pb;

	/* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
	return (lhs > rhs) - (lhs < rhs);
}
/*
 * Brings the sorted copy of the samples up to date. Does nothing when the
 * copy is already flagged as valid; otherwise the samples are copied into
 * the sorted array (allocated here if missing) and sorted ascending with
 * the comparator matching the current element type.
 */
static void igt_stats_ensure_sorted_values(igt_stats_t *stats)
{
	/*
	 * NOTE(review): the same element size is used for integer and
	 * floating point data, which presumes sizeof(double) ==
	 * sizeof(uint64_t) - confirm for the supported targets.
	 */
	const size_t elem_size = sizeof(*stats->values_u64);

	if (stats->sorted_array_valid)
		return;

	if (stats->sorted_u64 == NULL) {
		/*
		 * igt_stats_ensure_capacity() frees the sorted array whenever
		 * the capacity grows, so it is (re)allocated lazily here at
		 * the current capacity.
		 */
		stats->sorted_u64 = calloc(stats->capacity, elem_size);
		igt_assert(stats->sorted_u64);
	}

	memcpy(stats->sorted_u64, stats->values_u64,
	       elem_size * stats->n_values);

	if (stats->is_float)
		qsort(stats->sorted_u64, stats->n_values, elem_size, cmp_f);
	else
		qsort(stats->sorted_u64, stats->n_values, elem_size, cmp_u64);

	stats->sorted_array_valid = true;
}
/*
 * Median of the sorted samples in the window [start, end).
 *
 * We use Tukey's hinges for our quartiles determination: when the window
 * holds an odd number of samples, the median datum belongs to both halves.
 * The optional out parameters describe those halves: the lower half is
 * [start, *lower_end) and the upper half is [*upper_start, end).
 * Ends (end, lower_end) are exclusive.
 *
 * An empty window has no median: 0 is returned and both halves are reported
 * as empty (positioned at @start) instead of indexing before the window.
 */
static double
igt_stats_get_median_internal(igt_stats_t *stats,
			      unsigned int start, unsigned int end,
			      unsigned int *lower_end /* out */,
			      unsigned int *upper_start /* out */)
{
	unsigned int mid, n_values;
	double median;

	/*
	 * Without this guard the even-count branch below would compute
	 * start + 0 - 1, which wraps around for start == 0.
	 */
	if (end <= start) {
		if (lower_end)
			*lower_end = start;
		if (upper_start)
			*upper_start = start;
		return 0.;
	}

	n_values = end - start;

	igt_stats_ensure_sorted_values(stats);

	/* odd number of data points */
	if (n_values % 2 == 1) {
		/* median is the value in the middle (actual datum) */
		mid = start + n_values / 2;
		median = sorted_value(stats, mid);

		/* the two halves contain the median value */
		if (lower_end)
			*lower_end = mid + 1;
		if (upper_start)
			*upper_start = mid;

	/* even number of data points */
	} else {
		/*
		 * The middle is in between two indexes, 'mid' points at the
		 * lower one. The median is then the average between those two
		 * values.
		 */
		mid = start + n_values / 2 - 1;
		median = (sorted_value(stats, mid) + sorted_value(stats, mid+1))/2.;

		if (lower_end)
			*lower_end = mid + 1;
		if (upper_start)
			*upper_start = mid + 1;
	}

	return median;
}
/**
 * igt_stats_get_quartiles:
 * @stats: An #igt_stats_t instance
 * @q1: (out): lower or 25th quartile
 * @q2: (out): median or 50th quartile
 * @q3: (out): upper or 75th quartile
 *
 * Retrieves the [quartiles](https://en.wikipedia.org/wiki/Quartile) of the
 * @stats dataset. Any of the out parameters may be NULL when the caller is
 * not interested in that quartile. With fewer than 3 samples all three
 * quartiles are reported as 0.
 */
void igt_stats_get_quartiles(igt_stats_t *stats,
			     double *q1, double *q2, double *q3)
{
	double lower = 0., middle = 0., upper = 0.;

	if (stats->n_values >= 3) {
		unsigned int lower_end, upper_start;

		/* the overall median also tells us where the halves split */
		middle = igt_stats_get_median_internal(stats,
						       0, stats->n_values,
						       &lower_end,
						       &upper_start);
		lower = igt_stats_get_median_internal(stats, 0, lower_end,
						      NULL, NULL);
		upper = igt_stats_get_median_internal(stats, upper_start,
						      stats->n_values,
						      NULL, NULL);
	}

	if (q1)
		*q1 = lower;
	if (q2)
		*q2 = middle;
	if (q3)
		*q3 = upper;
}
/**
 * igt_stats_get_iqr:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the
 * [interquartile range](https://en.wikipedia.org/wiki/Interquartile_range)
 * (IQR) of the @stats dataset, i.e. the upper quartile minus the lower one
 * as computed by igt_stats_get_quartiles().
 */
double igt_stats_get_iqr(igt_stats_t *stats)
{
	double lower, upper;

	igt_stats_get_quartiles(stats, &lower, NULL, &upper);

	return upper - lower;
}
/**
 * igt_stats_get_median:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the median of the whole @stats dataset.
 */
double igt_stats_get_median(igt_stats_t *stats)
{
	const unsigned int first = 0;

	/* the caller does not need the half boundaries */
	return igt_stats_get_median_internal(stats, first, stats->n_values,
					     NULL, NULL);
}
/*
 * Refreshes the cached mean and variance of @stats in a single pass over the
 * samples; does nothing if the cache is still valid.
 *
 * Algorithm popularised by Knuth in:
 *
 * The Art of Computer Programming, volume 2: Seminumerical Algorithms,
 * 3rd edn., p. 232. Boston: Addison-Wesley
 *
 * Source: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 *
 * The sum of squared deviations is divided by n - 1 (Bessel's correction)
 * when @stats is a sample holding more than one value, and by n otherwise.
 */
static void igt_stats_knuth_mean_variance(igt_stats_t *stats)
{
	double running_mean = 0., sq_dev = 0.;
	unsigned int i = 0;

	if (stats->mean_variance_valid)
		return;

	while (i < stats->n_values) {
		const double sample = unsorted_value(stats, i);
		const double delta = sample - running_mean;

		running_mean += delta / (i + 1);
		/* uses the mean *after* the update, as the algorithm requires */
		sq_dev += delta * (sample - running_mean);
		i++;
	}

	stats->mean = running_mean;

	if (stats->is_population || stats->n_values <= 1)
		stats->variance = sq_dev / stats->n_values;
	else
		stats->variance = sq_dev / (stats->n_values - 1);

	stats->mean_variance_valid = true;
}
/**
 * igt_stats_get_mean:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the arithmetic mean of the @stats dataset, recomputing the
 * cached value first if samples were added since the last query.
 */
double igt_stats_get_mean(igt_stats_t *stats)
{
	double mean;

	igt_stats_knuth_mean_variance(stats);

	mean = stats->mean;
	return mean;
}
/**
 * igt_stats_get_variance:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the variance of the @stats dataset, recomputing the cached value
 * first if needed. Whether Bessel's correction is applied depends on
 * igt_stats_set_population().
 */
double igt_stats_get_variance(igt_stats_t *stats)
{
	double variance;

	igt_stats_knuth_mean_variance(stats);

	variance = stats->variance;
	return variance;
}
/**
 * igt_stats_get_std_deviation:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the standard deviation of the @stats dataset, i.e. the square
 * root of the variance returned by igt_stats_get_variance().
 */
double igt_stats_get_std_deviation(igt_stats_t *stats)
{
	double variance;

	igt_stats_knuth_mean_variance(stats);

	variance = stats->variance;
	return sqrt(variance);
}
/**
 * igt_stats_get_iqm:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the
 * [interquartile mean](https://en.wikipedia.org/wiki/Interquartile_mean) (IQM)
 * of the @stats dataset.
 *
 * The interquartile mean is a "statistical measure of central tendency".
 * It is a truncated mean that discards the lowest and highest 25% of values,
 * and calculates the mean value of the remaining central values. When the
 * number of samples is not a multiple of 4, the two samples straddling the
 * quartile boundaries contribute with a fractional weight.
 *
 * It's useful to hide outliers in measurements (due to cold cache etc).
 *
 * Returns: the interquartile mean, or 0 if @stats is empty.
 */
double igt_stats_get_iqm(igt_stats_t *stats)
{
	const unsigned int n = stats->n_values;
	unsigned int first, last, i;
	double lo, hi, sum;

	if (n == 0)
		return 0.;

	igt_stats_ensure_sorted_values(stats);

	/*
	 * Think of sorted sample i as covering the interval [i, i + 1) of
	 * [0, n). The interquartile window is [n/4, 3n/4]; every sample is
	 * weighted by how much of its interval falls inside that window.
	 */
	lo = n / 4.;
	hi = 3. * n / 4.;

	/* first and one-past-last sample overlapping the window */
	first = n / 4;
	last = (unsigned int)((3ULL * n + 3) / 4);

	sum = 0.;
	for (i = first; i < last; i++) {
		double left = i > lo ? i : lo;
		double right = i + 1. < hi ? i + 1. : hi;

		sum += (right - left) * sorted_value(stats, i);
	}

	/* the window always spans n/2 > 0 here */
	return sum / (hi - lo);
}
/**
 * igt_stats_get_trimean:
 * @stats: An #igt_stats_t instance
 *
 * Retrieves the [trimean](https://en.wikipedia.org/wiki/Trimean) of the @stats
 * dataset: the weighted average (Q1 + 2 * Q2 + Q3) / 4 of the quartiles
 * returned by igt_stats_get_quartiles().
 *
 * The trimean is the most efficient 3-point L-estimator, even more
 * robust than the median at estimating the average of a sample population.
 */
double igt_stats_get_trimean(igt_stats_t *stats)
{
	double lower, median, upper;
	double weighted_sum;

	igt_stats_get_quartiles(stats, &lower, &median, &upper);

	weighted_sum = lower + 2*median + upper;
	return weighted_sum / 4;
}
/**
 * igt_mean_init:
 * @m: tracking structure
 *
 * Initializes or resets @m: every field is zeroed, then the minimum and
 * maximum are set to +/-HUGE_VAL so that the first added value replaces
 * both.
 */
void igt_mean_init(struct igt_mean *m)
{
	memset(m, 0, sizeof(*m));

	m->min = HUGE_VAL;
	m->max = -HUGE_VAL;
}
/**
 * igt_mean_add:
 * @m: tracking structure
 * @v: value
 *
 * Adds a new value @v to @m, updating the running mean, the running sum of
 * squared deviations and the observed minimum and maximum.
 */
void igt_mean_add(struct igt_mean *m, double v)
{
	const double delta = v - m->mean;

	m->count++;
	m->mean += delta / m->count;
	/* second factor uses the mean *after* it has been updated */
	m->sq += delta * (v - m->mean);

	if (m->min > v)
		m->min = v;
	if (m->max < v)
		m->max = v;
}
/**
 * igt_mean_get:
 * @m: tracking structure
 *
 * Returns the current mean of the samples tracked in @m. The mean is
 * maintained incrementally by igt_mean_add(), so this is a plain read.
 */
double igt_mean_get(struct igt_mean *m)
{
	const double current = m->mean;

	return current;
}
/**
 * igt_mean_get_variance:
 * @m: tracking structure
 *
 * Computes the current variance of the samples tracked in @m as the
 * accumulated sum of squared deviations divided by the number of samples
 * (no Bessel's correction is applied).
 */
double igt_mean_get_variance(struct igt_mean *m)
{
	double variance;

	variance = m->sq / m->count;
	return variance;
}