You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1296 lines
40 KiB
1296 lines
40 KiB
/*
|
|
Copyright (c) 2013 Julien Pommier.
|
|
Copyright (c) 2019 Hayati Ayguen ( h_ayguen@web.de )
|
|
|
|
Small test & bench for PFFFT, comparing its performance with the scalar FFTPACK, FFTW, and Apple vDSP
|
|
|
|
How to build:
|
|
|
|
on linux, with fftw3:
|
|
gcc -o test_pffft -DHAVE_FFTW -msse -mfpmath=sse -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -lm
|
|
|
|
on macos, without fftw3:
|
|
clang -o test_pffft -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -framework Accelerate
|
|
|
|
on macos, with fftw3:
|
|
clang -o test_pffft -DHAVE_FFTW -DHAVE_VECLIB -O3 -Wall -W pffft.c test_pffft.c fftpack.c -L/usr/local/lib -I/usr/local/include/ -lfftw3f -framework Accelerate
|
|
|
|
as alternative: replace clang by gcc.
|
|
|
|
on windows, with visual c++:
|
|
cl /Ox -D_USE_MATH_DEFINES /arch:SSE test_pffft.c pffft.c fftpack.c
|
|
|
|
build without SIMD instructions:
|
|
gcc -o test_pffft -DPFFFT_SIMD_DISABLE -O3 -Wall -W pffft.c test_pffft.c fftpack.c -lm
|
|
|
|
*/
|
|
|
|
#define CONCAT_TOKENS(A, B) A ## B
|
|
#define CONCAT_THREE_TOKENS(A, B, C) A ## B ## C
|
|
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
#include "pffft.h"
|
|
|
|
typedef float pffft_scalar;
|
|
typedef PFFFT_Setup PFFFT_SETUP;
|
|
#define PFFFT_FUNC(F) CONCAT_TOKENS(pffft_, F)
|
|
|
|
#else
|
|
/*
|
|
Note: adapted for double precision dynamic range version.
|
|
*/
|
|
#include "pffft_double.h"
|
|
|
|
typedef double pffft_scalar;
|
|
typedef PFFFTD_Setup PFFFT_SETUP;
|
|
#define PFFFT_FUNC(F) CONCAT_TOKENS(pffftd_, F)
|
|
#endif
|
|
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
|
|
#include "fftpack.h"
|
|
|
|
#ifdef HAVE_GREEN_FFTS
|
|
#include "fftext.h"
|
|
#endif
|
|
|
|
#ifdef HAVE_KISS_FFT
|
|
#include <kiss_fft.h>
|
|
#include <kiss_fftr.h>
|
|
#endif
|
|
|
|
#endif
|
|
|
|
#ifdef HAVE_POCKET_FFT
|
|
#include <pocketfft_double.h>
|
|
#include <pocketfft_single.h>
|
|
#endif
|
|
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
#define POCKFFTR_PRE(R) CONCAT_TOKENS(rffts, R)
|
|
#define POCKFFTC_PRE(R) CONCAT_TOKENS(cffts, R)
|
|
#define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rffts, R)
|
|
#define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cffts, R)
|
|
#else
|
|
#define POCKFFTR_PRE(R) CONCAT_TOKENS(rfft, R)
|
|
#define POCKFFTC_PRE(R) CONCAT_TOKENS(cfft, R)
|
|
#define POCKFFTR_MID(L,R) CONCAT_THREE_TOKENS(L, rfft, R)
|
|
#define POCKFFTC_MID(L,R) CONCAT_THREE_TOKENS(L, cfft, R)
|
|
#endif
|
|
|
|
|
|
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <time.h>
|
|
#include <assert.h>
|
|
#include <string.h>
|
|
|
|
#ifdef HAVE_SYS_TIMES
|
|
# include <sys/times.h>
|
|
# include <unistd.h>
|
|
#endif
|
|
|
|
#ifdef HAVE_VECLIB
|
|
# include <Accelerate/Accelerate.h>
|
|
#endif
|
|
|
|
#ifdef HAVE_FFTW
|
|
# include <fftw3.h>
|
|
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
typedef fftwf_plan FFTW_PLAN;
|
|
typedef fftwf_complex FFTW_COMPLEX;
|
|
#define FFTW_FUNC(F) CONCAT_TOKENS(fftwf_, F)
|
|
#else
|
|
typedef fftw_plan FFTW_PLAN;
|
|
typedef fftw_complex FFTW_COMPLEX;
|
|
#define FFTW_FUNC(F) CONCAT_TOKENS(fftw_, F)
|
|
#endif
|
|
|
|
#endif /* HAVE_FFTW */
|
|
|
|
#ifndef M_LN2
|
|
#define M_LN2 0.69314718055994530942 /* log_e 2 */
|
|
#endif
|
|
|
|
|
|
#define NUM_FFT_ALGOS 9
|
|
enum {
|
|
ALGO_FFTPACK = 0,
|
|
ALGO_VECLIB,
|
|
ALGO_FFTW_ESTIM,
|
|
ALGO_FFTW_AUTO,
|
|
ALGO_GREEN,
|
|
ALGO_KISS,
|
|
ALGO_POCKET,
|
|
ALGO_PFFFT_U, /* = 7 */
|
|
ALGO_PFFFT_O /* = 8 */
|
|
};
|
|
|
|
#define NUM_TYPES 7
|
|
enum {
|
|
TYPE_PREP = 0, /* time for preparation in ms */
|
|
TYPE_DUR_NS = 1, /* time per fft in ns */
|
|
TYPE_DUR_FASTEST = 2, /* relative time to fastest */
|
|
TYPE_REL_PFFFT = 3, /* relative time to ALGO_PFFFT */
|
|
TYPE_ITER = 4, /* # of iterations in measurement */
|
|
TYPE_MFLOPS = 5, /* MFlops/sec */
|
|
TYPE_DUR_TOT = 6 /* test duration in sec */
|
|
};
|
|
/* double tmeas[NUM_TYPES][NUM_FFT_ALGOS]; */
|
|
|
|
const char * algoName[NUM_FFT_ALGOS] = {
|
|
"FFTPack ",
|
|
"vDSP (vec) ",
|
|
"FFTW F(estim)",
|
|
"FFTW F(auto) ",
|
|
"Green ",
|
|
"Kiss ",
|
|
"Pocket ",
|
|
"PFFFT-U(simd)", /* unordered */
|
|
"PFFFT (simd) " /* ordered */
|
|
};
|
|
|
|
|
|
int compiledInAlgo[NUM_FFT_ALGOS] = {
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
1, /* "FFTPack " */
|
|
#else
|
|
0, /* "FFTPack " */
|
|
#endif
|
|
#if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
|
|
1, /* "vDSP (vec) " */
|
|
#else
|
|
0,
|
|
#endif
|
|
#if defined(HAVE_FFTW)
|
|
1, /* "FFTW(estim)" */
|
|
1, /* "FFTW (auto)" */
|
|
#else
|
|
0, 0,
|
|
#endif
|
|
#if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
|
|
1, /* "Green " */
|
|
#else
|
|
0,
|
|
#endif
|
|
#if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
|
|
1, /* "Kiss " */
|
|
#else
|
|
0,
|
|
#endif
|
|
#if defined(HAVE_POCKET_FFT)
|
|
1, /* "Pocket " */
|
|
#else
|
|
0,
|
|
#endif
|
|
1, /* "PFFFT_U " */
|
|
1 /* "PFFFT_O " */
|
|
};
|
|
|
|
const char * algoTableHeader[NUM_FFT_ALGOS][2] = {
|
|
{ "| real FFTPack ", "| cplx FFTPack " },
|
|
{ "| real vDSP ", "| cplx vDSP " },
|
|
{ "|real FFTWestim", "|cplx FFTWestim" },
|
|
{ "|real FFTWauto ", "|cplx FFTWauto " },
|
|
{ "| real Green ", "| cplx Green " },
|
|
{ "| real Kiss ", "| cplx Kiss " },
|
|
{ "| real Pocket ", "| cplx Pocket " },
|
|
{ "| real PFFFT-U ", "| cplx PFFFT-U " },
|
|
{ "| real PFFFT ", "| cplx PFFFT " } };
|
|
|
|
const char * typeText[NUM_TYPES] = {
|
|
"preparation in ms",
|
|
"time per fft in ns",
|
|
"relative to fastest",
|
|
"relative to pffft",
|
|
"measured_num_iters",
|
|
"mflops",
|
|
"test duration in sec"
|
|
};
|
|
|
|
const char * typeFilenamePart[NUM_TYPES] = {
|
|
"1-preparation-in-ms",
|
|
"2-timePerFft-in-ns",
|
|
"3-rel-fastest",
|
|
"4-rel-pffft",
|
|
"5-num-iter",
|
|
"6-mflops",
|
|
"7-duration-in-sec"
|
|
};
|
|
|
|
#define SAVE_ALL_TYPES 0
|
|
|
|
const int saveType[NUM_TYPES] = {
|
|
1, /* "1-preparation-in-ms" */
|
|
0, /* "2-timePerFft-in-ns" */
|
|
0, /* "3-rel-fastest" */
|
|
1, /* "4-rel-pffft" */
|
|
1, /* "5-num-iter" */
|
|
1, /* "6-mflops" */
|
|
1, /* "7-duration-in-sec" */
|
|
};
|
|
|
|
|
|
#define MAX(x,y) ((x)>(y)?(x):(y))
|
|
#define MIN(x,y) ((x)<(y)?(x):(y))
|
|
|
|
unsigned Log2(unsigned v) {
|
|
/* we don't need speed records .. obvious way is good enough */
|
|
/* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious */
|
|
/* Find the log base 2 of an integer with the MSB N set in O(N) operations (the obvious way):
|
|
* unsigned v: 32-bit word to find the log base 2 of */
|
|
unsigned r = 0; /* r will be lg(v) */
|
|
while (v >>= 1)
|
|
{
|
|
r++;
|
|
}
|
|
return r;
|
|
}
|
|
|
|
|
|
double frand() {
|
|
return rand()/(double)RAND_MAX;
|
|
}
|
|
|
|
#if defined(HAVE_SYS_TIMES)
|
|
inline double uclock_sec(void) {
|
|
static double ttclk = 0.;
|
|
struct tms t;
|
|
if (ttclk == 0.)
|
|
ttclk = sysconf(_SC_CLK_TCK);
|
|
times(&t);
|
|
/* use only the user time of this process - not realtime, which depends on OS-scheduler .. */
|
|
return ((double)t.tms_utime)) / ttclk;
|
|
}
|
|
# else
|
|
double uclock_sec(void)
|
|
{ return (double)clock()/(double)CLOCKS_PER_SEC; }
|
|
#endif
|
|
|
|
|
|
/* compare results with the regular fftpack */
|
|
void pffft_validate_N(int N, int cplx) {
|
|
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
|
|
int Nfloat = N*(cplx?2:1);
|
|
int Nbytes = Nfloat * sizeof(pffft_scalar);
|
|
float *ref, *in, *out, *tmp, *tmp2;
|
|
PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
|
int pass;
|
|
|
|
|
|
if (!s) { printf("Skipping N=%d, not supported\n", N); return; }
|
|
ref = PFFFT_FUNC(aligned_malloc)(Nbytes);
|
|
in = PFFFT_FUNC(aligned_malloc)(Nbytes);
|
|
out = PFFFT_FUNC(aligned_malloc)(Nbytes);
|
|
tmp = PFFFT_FUNC(aligned_malloc)(Nbytes);
|
|
tmp2 = PFFFT_FUNC(aligned_malloc)(Nbytes);
|
|
|
|
for (pass=0; pass < 2; ++pass) {
|
|
float ref_max = 0;
|
|
int k;
|
|
/* printf("N=%d pass=%d cplx=%d\n", N, pass, cplx); */
|
|
/* compute reference solution with FFTPACK */
|
|
if (pass == 0) {
|
|
float *wrk = malloc(2*Nbytes+15*sizeof(pffft_scalar));
|
|
for (k=0; k < Nfloat; ++k) {
|
|
ref[k] = in[k] = (float)( frand()*2-1 );
|
|
out[k] = 1e30F;
|
|
}
|
|
if (!cplx) {
|
|
rffti(N, wrk);
|
|
rfftf(N, ref, wrk);
|
|
/* use our ordering for real ffts instead of the one of fftpack */
|
|
{
|
|
float refN=ref[N-1];
|
|
for (k=N-2; k >= 1; --k) ref[k+1] = ref[k];
|
|
ref[1] = refN;
|
|
}
|
|
} else {
|
|
cffti(N, wrk);
|
|
cfftf(N, ref, wrk);
|
|
}
|
|
free(wrk);
|
|
}
|
|
|
|
for (k = 0; k < Nfloat; ++k) ref_max = MAX(ref_max, (float)( fabs(ref[k]) ));
|
|
|
|
|
|
/* pass 0 : non canonical ordering of transform coefficients */
|
|
if (pass == 0) {
|
|
/* test forward transform, with different input / output */
|
|
PFFFT_FUNC(transform)(s, in, tmp, 0, PFFFT_FORWARD);
|
|
memcpy(tmp2, tmp, Nbytes);
|
|
memcpy(tmp, in, Nbytes);
|
|
PFFFT_FUNC(transform)(s, tmp, tmp, 0, PFFFT_FORWARD);
|
|
for (k = 0; k < Nfloat; ++k) {
|
|
assert(tmp2[k] == tmp[k]);
|
|
}
|
|
|
|
/* test reordering */
|
|
PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
|
|
PFFFT_FUNC(zreorder)(s, out, tmp, PFFFT_BACKWARD);
|
|
for (k = 0; k < Nfloat; ++k) {
|
|
assert(tmp2[k] == tmp[k]);
|
|
}
|
|
PFFFT_FUNC(zreorder)(s, tmp, out, PFFFT_FORWARD);
|
|
} else {
|
|
/* pass 1 : canonical ordering of transform coeffs. */
|
|
PFFFT_FUNC(transform_ordered)(s, in, tmp, 0, PFFFT_FORWARD);
|
|
memcpy(tmp2, tmp, Nbytes);
|
|
memcpy(tmp, in, Nbytes);
|
|
PFFFT_FUNC(transform_ordered)(s, tmp, tmp, 0, PFFFT_FORWARD);
|
|
for (k = 0; k < Nfloat; ++k) {
|
|
assert(tmp2[k] == tmp[k]);
|
|
}
|
|
memcpy(out, tmp, Nbytes);
|
|
}
|
|
|
|
{
|
|
for (k=0; k < Nfloat; ++k) {
|
|
if (!(fabs(ref[k] - out[k]) < 1e-3*ref_max)) {
|
|
printf("%s forward PFFFT mismatch found for N=%d\n", (cplx?"CPLX":"REAL"), N);
|
|
exit(1);
|
|
}
|
|
}
|
|
|
|
if (pass == 0) PFFFT_FUNC(transform)(s, tmp, out, 0, PFFFT_BACKWARD);
|
|
else PFFFT_FUNC(transform_ordered)(s, tmp, out, 0, PFFFT_BACKWARD);
|
|
memcpy(tmp2, out, Nbytes);
|
|
memcpy(out, tmp, Nbytes);
|
|
if (pass == 0) PFFFT_FUNC(transform)(s, out, out, 0, PFFFT_BACKWARD);
|
|
else PFFFT_FUNC(transform_ordered)(s, out, out, 0, PFFFT_BACKWARD);
|
|
for (k = 0; k < Nfloat; ++k) {
|
|
assert(tmp2[k] == out[k]);
|
|
out[k] *= 1.f/N;
|
|
}
|
|
for (k = 0; k < Nfloat; ++k) {
|
|
if (fabs(in[k] - out[k]) > 1e-3 * ref_max) {
|
|
printf("pass=%d, %s IFFFT does not match for N=%d\n", pass, (cplx?"CPLX":"REAL"), N); break;
|
|
exit(1);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* quick test of the circular convolution in fft domain */
|
|
{
|
|
float conv_err = 0, conv_max = 0;
|
|
|
|
PFFFT_FUNC(zreorder)(s, ref, tmp, PFFFT_FORWARD);
|
|
memset(out, 0, Nbytes);
|
|
PFFFT_FUNC(zconvolve_accumulate)(s, ref, ref, out, 1.0);
|
|
PFFFT_FUNC(zreorder)(s, out, tmp2, PFFFT_FORWARD);
|
|
|
|
for (k=0; k < Nfloat; k += 2) {
|
|
float ar = tmp[k], ai=tmp[k+1];
|
|
if (cplx || k > 0) {
|
|
tmp[k] = ar*ar - ai*ai;
|
|
tmp[k+1] = 2*ar*ai;
|
|
} else {
|
|
tmp[0] = ar*ar;
|
|
tmp[1] = ai*ai;
|
|
}
|
|
}
|
|
|
|
for (k=0; k < Nfloat; ++k) {
|
|
float d = fabs(tmp[k] - tmp2[k]), e = fabs(tmp[k]);
|
|
if (d > conv_err) conv_err = d;
|
|
if (e > conv_max) conv_max = e;
|
|
}
|
|
if (conv_err > 1e-5*conv_max) {
|
|
printf("zconvolve error ? %g %g\n", conv_err, conv_max); exit(1);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
printf("%s PFFFT is OK for N=%d\n", (cplx?"CPLX":"REAL"), N); fflush(stdout);
|
|
|
|
PFFFT_FUNC(destroy_setup)(s);
|
|
PFFFT_FUNC(aligned_free)(ref);
|
|
PFFFT_FUNC(aligned_free)(in);
|
|
PFFFT_FUNC(aligned_free)(out);
|
|
PFFFT_FUNC(aligned_free)(tmp);
|
|
PFFFT_FUNC(aligned_free)(tmp2);
|
|
|
|
#endif /* PFFFT_ENABLE_FLOAT */
|
|
}
|
|
|
|
void pffft_validate(int cplx) {
|
|
static int Ntest[] = { 16, 32, 64, 96, 128, 160, 192, 256, 288, 384, 5*96, 512, 576, 5*128, 800, 864, 1024, 2048, 2592, 4000, 4096, 12000, 36864, 0};
|
|
int k;
|
|
for (k = 0; Ntest[k]; ++k) {
|
|
int N = Ntest[k];
|
|
if (N == 16 && !cplx) continue;
|
|
pffft_validate_N(N, cplx);
|
|
}
|
|
}
|
|
|
|
int array_output_format = 1;
|
|
|
|
|
|
void print_table(const char *txt, FILE *tableFile) {
|
|
fprintf(stdout, "%s", txt);
|
|
if (tableFile && tableFile != stdout)
|
|
fprintf(tableFile, "%s", txt);
|
|
}
|
|
|
|
void print_table_flops(float mflops, FILE *tableFile) {
|
|
fprintf(stdout, "|%11.0f ", mflops);
|
|
if (tableFile && tableFile != stdout)
|
|
fprintf(tableFile, "|%11.0f ", mflops);
|
|
}
|
|
|
|
void print_table_fftsize(int N, FILE *tableFile) {
|
|
fprintf(stdout, "|%9d ", N);
|
|
if (tableFile && tableFile != stdout)
|
|
fprintf(tableFile, "|%9d ", N);
|
|
}
|
|
|
|
double show_output(const char *name, int N, int cplx, float flops, float t0, float t1, int max_iter, FILE *tableFile) {
|
|
double T = (double)(t1-t0)/2/max_iter * 1e9;
|
|
float mflops = flops/1e6/(t1 - t0 + 1e-16);
|
|
if (array_output_format) {
|
|
if (flops != -1)
|
|
print_table_flops(mflops, tableFile);
|
|
else
|
|
print_table("| n/a ", tableFile);
|
|
} else {
|
|
if (flops != -1) {
|
|
printf("N=%5d, %s %16s : %6.0f MFlops [t=%6.0f ns, %d runs]\n", N, (cplx?"CPLX":"REAL"), name, mflops, (t1-t0)/2/max_iter * 1e9, max_iter);
|
|
}
|
|
}
|
|
fflush(stdout);
|
|
return T;
|
|
}
|
|
|
|
double cal_benchmark(int N, int cplx) {
|
|
const int log2N = Log2(N);
|
|
int Nfloat = (cplx ? N*2 : N);
|
|
int Nbytes = Nfloat * sizeof(pffft_scalar);
|
|
pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
|
|
double t0, t1, tstop, T, nI;
|
|
int k, iter;
|
|
|
|
assert( PFFFT_FUNC(is_power_of_two)(N) );
|
|
for (k = 0; k < Nfloat; ++k) {
|
|
X[k] = sqrtf(k+1);
|
|
}
|
|
|
|
/* PFFFT-U (unordered) benchmark */
|
|
PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
|
assert(s);
|
|
iter = 0;
|
|
t0 = uclock_sec();
|
|
tstop = t0 + 0.25; /* benchmark duration: 250 ms */
|
|
do {
|
|
for ( k = 0; k < 512; ++k ) {
|
|
PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
|
|
PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
|
|
++iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
PFFFT_FUNC(destroy_setup)(s);
|
|
PFFFT_FUNC(aligned_free)(X);
|
|
PFFFT_FUNC(aligned_free)(Y);
|
|
PFFFT_FUNC(aligned_free)(Z);
|
|
|
|
T = ( t1 - t0 ); /* duration per fft() */
|
|
nI = ((double)iter) * ( log2N * N ); /* number of iterations "normalized" to O(N) = N*log2(N) */
|
|
return (nI / T); /* normalized iterations per second */
|
|
}
|
|
|
|
|
|
|
|
void benchmark_ffts(int N, int cplx, int withFFTWfullMeas, double iterCal, double tmeas[NUM_TYPES][NUM_FFT_ALGOS], int haveAlgo[NUM_FFT_ALGOS], FILE *tableFile ) {
|
|
const int log2N = Log2(N);
|
|
int nextPow2N = PFFFT_FUNC(next_power_of_two)(N);
|
|
int log2NextN = Log2(nextPow2N);
|
|
int pffftPow2N = nextPow2N;
|
|
|
|
int Nfloat = (cplx ? MAX(nextPow2N, pffftPow2N)*2 : MAX(nextPow2N, pffftPow2N));
|
|
int Nmax, k, iter;
|
|
int Nbytes = Nfloat * sizeof(pffft_scalar);
|
|
|
|
pffft_scalar *X = PFFFT_FUNC(aligned_malloc)(Nbytes + sizeof(pffft_scalar)), *Y = PFFFT_FUNC(aligned_malloc)(Nbytes + 2*sizeof(pffft_scalar) ), *Z = PFFFT_FUNC(aligned_malloc)(Nbytes);
|
|
double te, t0, t1, tstop, flops, Tfastest;
|
|
|
|
const double max_test_duration = 0.150; /* test duration 150 ms */
|
|
double numIter = max_test_duration * iterCal / ( log2N * N ); /* number of iteration for max_test_duration */
|
|
const int step_iter = MAX(1, ((int)(0.01 * numIter)) ); /* one hundredth */
|
|
int max_iter = MAX(1, ((int)numIter) ); /* minimum 1 iteration */
|
|
|
|
const float checkVal = 12345.0F;
|
|
|
|
/* printf("benchmark_ffts(N = %d, cplx = %d): Nfloat = %d, X_mem = 0x%p, X = %p\n", N, cplx, Nfloat, X_mem, X); */
|
|
|
|
memset( X, 0, Nfloat * sizeof(pffft_scalar) );
|
|
if ( Nfloat < 32 ) {
|
|
for (k = 0; k < Nfloat; k += 4)
|
|
X[k] = sqrtf(k+1);
|
|
} else {
|
|
for (k = 0; k < Nfloat; k += (Nfloat/16) )
|
|
X[k] = sqrtf(k+1);
|
|
}
|
|
|
|
for ( k = 0; k < NUM_TYPES; ++k )
|
|
{
|
|
for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
|
|
tmeas[k][iter] = 0.0;
|
|
}
|
|
|
|
|
|
/* FFTPack benchmark */
|
|
Nmax = (cplx ? N*2 : N);
|
|
X[Nmax] = checkVal;
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
{
|
|
float *wrk = malloc(2*Nbytes + 15*sizeof(pffft_scalar));
|
|
te = uclock_sec();
|
|
if (cplx) cffti(N, wrk);
|
|
else rffti(N, wrk);
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
if (cplx) {
|
|
assert( X[Nmax] == checkVal );
|
|
cfftf(N, X, wrk);
|
|
assert( X[Nmax] == checkVal );
|
|
cfftb(N, X, wrk);
|
|
assert( X[Nmax] == checkVal );
|
|
} else {
|
|
assert( X[Nmax] == checkVal );
|
|
rfftf(N, X, wrk);
|
|
assert( X[Nmax] == checkVal );
|
|
rfftb(N, X, wrk);
|
|
assert( X[Nmax] == checkVal );
|
|
}
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
free(wrk);
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_FFTPACK] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_FFTPACK] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_FFTPACK] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_FFTPACK] = show_output("FFTPack", N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_FFTPACK] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_FFTPACK] = 1;
|
|
}
|
|
#endif
|
|
|
|
#if defined(HAVE_VECLIB) && defined(PFFFT_ENABLE_FLOAT)
|
|
Nmax = (cplx ? nextPow2N*2 : nextPow2N);
|
|
X[Nmax] = checkVal;
|
|
te = uclock_sec();
|
|
if ( 1 || PFFFT_FUNC(is_power_of_two)(N) ) {
|
|
FFTSetup setup;
|
|
|
|
setup = vDSP_create_fftsetup(log2NextN, FFT_RADIX2);
|
|
DSPSplitComplex zsamples;
|
|
zsamples.realp = &X[0];
|
|
zsamples.imagp = &X[Nfloat/2];
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
if (cplx) {
|
|
assert( X[Nmax] == checkVal );
|
|
vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
|
|
assert( X[Nmax] == checkVal );
|
|
vDSP_fft_zip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
|
|
assert( X[Nmax] == checkVal );
|
|
} else {
|
|
assert( X[Nmax] == checkVal );
|
|
vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Forward);
|
|
assert( X[Nmax] == checkVal );
|
|
vDSP_fft_zrip(setup, &zsamples, 1, log2NextN, kFFTDirection_Inverse);
|
|
assert( X[Nmax] == checkVal );
|
|
}
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
vDSP_destroy_fftsetup(setup);
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_VECLIB] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_VECLIB] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_VECLIB] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_VECLIB] = show_output("vDSP", N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_VECLIB] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_VECLIB] = 1;
|
|
} else {
|
|
show_output("vDSP", N, cplx, -1, -1, -1, -1, tableFile);
|
|
}
|
|
#endif
|
|
|
|
#if defined(HAVE_FFTW)
|
|
Nmax = (cplx ? N*2 : N);
|
|
X[Nmax] = checkVal;
|
|
{
|
|
/* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE); measure takes a lot of time on largest ffts */
|
|
int flags = FFTW_ESTIMATE;
|
|
te = uclock_sec();
|
|
|
|
FFTW_PLAN planf, planb;
|
|
FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
|
|
FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
|
|
memset(in, 0, sizeof(FFTW_COMPLEX) * N);
|
|
if (cplx) {
|
|
planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
|
|
planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
|
|
} else {
|
|
planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
|
|
planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
|
|
}
|
|
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
assert( X[Nmax] == checkVal );
|
|
FFTW_FUNC(execute)(planf);
|
|
assert( X[Nmax] == checkVal );
|
|
FFTW_FUNC(execute)(planb);
|
|
assert( X[Nmax] == checkVal );
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
FFTW_FUNC(destroy_plan)(planf);
|
|
FFTW_FUNC(destroy_plan)(planb);
|
|
FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_FFTW_ESTIM] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_FFTW_ESTIM] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_FFTW_ESTIM] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_FFTW_ESTIM] = 1;
|
|
}
|
|
Nmax = (cplx ? N*2 : N);
|
|
X[Nmax] = checkVal;
|
|
do {
|
|
/* int flags = (N <= (256*1024) ? FFTW_MEASURE : FFTW_ESTIMATE); measure takes a lot of time on largest ffts */
|
|
/* int flags = FFTW_MEASURE; */
|
|
#if ( defined(__arm__) || defined(__aarch64__) || defined(__arm64__) )
|
|
int limitFFTsize = 31; /* takes over a second on Raspberry Pi 3 B+ -- and much much more on higher ffts sizes! */
|
|
#else
|
|
int limitFFTsize = 2400; /* take over a second on i7 for fft size 2400 */
|
|
#endif
|
|
int flags = (N < limitFFTsize ? FFTW_MEASURE : (withFFTWfullMeas ? FFTW_MEASURE : FFTW_ESTIMATE));
|
|
|
|
if (flags == FFTW_ESTIMATE) {
|
|
show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, -1, -1, -1, -1, tableFile);
|
|
/* copy values from estimation */
|
|
tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = tmeas[TYPE_ITER][ALGO_FFTW_ESTIM];
|
|
tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_TOT][ALGO_FFTW_ESTIM];
|
|
tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = tmeas[TYPE_DUR_NS][ALGO_FFTW_ESTIM];
|
|
tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = tmeas[TYPE_PREP][ALGO_FFTW_ESTIM];
|
|
} else {
|
|
te = uclock_sec();
|
|
FFTW_PLAN planf, planb;
|
|
FFTW_COMPLEX *in = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
|
|
FFTW_COMPLEX *out = (FFTW_COMPLEX*) FFTW_FUNC(malloc)(sizeof(FFTW_COMPLEX) * N);
|
|
memset(in, 0, sizeof(FFTW_COMPLEX) * N);
|
|
if (cplx) {
|
|
planf = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_FORWARD, flags);
|
|
planb = FFTW_FUNC(plan_dft_1d)(N, in, out, FFTW_BACKWARD, flags);
|
|
} else {
|
|
planf = FFTW_FUNC(plan_dft_r2c_1d)(N, (pffft_scalar*)in, out, flags);
|
|
planb = FFTW_FUNC(plan_dft_c2r_1d)(N, in, (pffft_scalar*)out, flags);
|
|
}
|
|
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
assert( X[Nmax] == checkVal );
|
|
FFTW_FUNC(execute)(planf);
|
|
assert( X[Nmax] == checkVal );
|
|
FFTW_FUNC(execute)(planb);
|
|
assert( X[Nmax] == checkVal );
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
FFTW_FUNC(destroy_plan)(planf);
|
|
FFTW_FUNC(destroy_plan)(planb);
|
|
FFTW_FUNC(free)(in); FFTW_FUNC(free)(out);
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_FFTW_AUTO] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_FFTW_AUTO] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_FFTW_AUTO] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_FFTW_AUTO] = show_output((flags == FFTW_MEASURE ? algoName[ALGO_FFTW_AUTO] : algoName[ALGO_FFTW_ESTIM]), N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_FFTW_AUTO] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_FFTW_AUTO] = 1;
|
|
}
|
|
} while (0);
|
|
#else
|
|
(void)withFFTWfullMeas;
|
|
#endif
|
|
|
|
#if defined(HAVE_GREEN_FFTS) && defined(PFFFT_ENABLE_FLOAT)
|
|
Nmax = (cplx ? nextPow2N*2 : nextPow2N);
|
|
X[Nmax] = checkVal;
|
|
if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
|
|
{
|
|
te = uclock_sec();
|
|
fftInit(log2NextN);
|
|
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
if (cplx) {
|
|
assert( X[Nmax] == checkVal );
|
|
ffts(X, log2NextN, 1);
|
|
assert( X[Nmax] == checkVal );
|
|
iffts(X, log2NextN, 1);
|
|
assert( X[Nmax] == checkVal );
|
|
} else {
|
|
rffts(X, log2NextN, 1);
|
|
riffts(X, log2NextN, 1);
|
|
}
|
|
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
fftFree();
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_GREEN] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_GREEN] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_GREEN] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_GREEN] = show_output("Green", N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_GREEN] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_GREEN] = 1;
|
|
} else {
|
|
show_output("Green", N, cplx, -1, -1, -1, -1, tableFile);
|
|
}
|
|
#endif
|
|
|
|
#if defined(HAVE_KISS_FFT) && defined(PFFFT_ENABLE_FLOAT)
|
|
Nmax = (cplx ? nextPow2N*2 : nextPow2N);
|
|
X[Nmax] = checkVal;
|
|
if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
|
|
{
|
|
kiss_fft_cfg stf;
|
|
kiss_fft_cfg sti;
|
|
kiss_fftr_cfg stfr;
|
|
kiss_fftr_cfg stir;
|
|
|
|
te = uclock_sec();
|
|
if (cplx) {
|
|
stf = kiss_fft_alloc(nextPow2N, 0, 0, 0);
|
|
sti = kiss_fft_alloc(nextPow2N, 1, 0, 0);
|
|
} else {
|
|
stfr = kiss_fftr_alloc(nextPow2N, 0, 0, 0);
|
|
stir = kiss_fftr_alloc(nextPow2N, 1, 0, 0);
|
|
}
|
|
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
if (cplx) {
|
|
assert( X[Nmax] == checkVal );
|
|
kiss_fft(stf, (const kiss_fft_cpx *)X, (kiss_fft_cpx *)Y);
|
|
assert( X[Nmax] == checkVal );
|
|
kiss_fft(sti, (const kiss_fft_cpx *)Y, (kiss_fft_cpx *)X);
|
|
assert( X[Nmax] == checkVal );
|
|
} else {
|
|
assert( X[Nmax] == checkVal );
|
|
kiss_fftr(stfr, X, (kiss_fft_cpx *)Y);
|
|
assert( X[Nmax] == checkVal );
|
|
kiss_fftri(stir, (const kiss_fft_cpx *)Y, X);
|
|
assert( X[Nmax] == checkVal );
|
|
}
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
kiss_fft_cleanup();
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_KISS] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_KISS] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_KISS] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_KISS] = show_output("Kiss", N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_KISS] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_KISS] = 1;
|
|
} else {
|
|
show_output("Kiss", N, cplx, -1, -1, -1, -1, tableFile);
|
|
}
|
|
#endif
|
|
|
|
#if defined(HAVE_POCKET_FFT)
|
|
|
|
Nmax = (cplx ? nextPow2N*2 : nextPow2N);
|
|
X[Nmax] = checkVal;
|
|
if ( 1 || PFFFT_FUNC(is_power_of_two)(N) )
|
|
{
|
|
POCKFFTR_PRE(_plan) planr;
|
|
POCKFFTC_PRE(_plan) planc;
|
|
|
|
te = uclock_sec();
|
|
if (cplx) {
|
|
planc = POCKFFTC_MID(make_,_plan)(nextPow2N);
|
|
} else {
|
|
planr = POCKFFTR_MID(make_,_plan)(nextPow2N);
|
|
}
|
|
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
if (cplx) {
|
|
assert( X[Nmax] == checkVal );
|
|
memcpy(Y, X, 2*nextPow2N * sizeof(pffft_scalar) );
|
|
POCKFFTC_PRE(_forward)(planc, Y, 1.);
|
|
assert( X[Nmax] == checkVal );
|
|
memcpy(X, Y, 2*nextPow2N * sizeof(pffft_scalar) );
|
|
POCKFFTC_PRE(_backward)(planc, X, 1./nextPow2N);
|
|
assert( X[Nmax] == checkVal );
|
|
} else {
|
|
assert( X[Nmax] == checkVal );
|
|
memcpy(Y, X, nextPow2N * sizeof(pffft_scalar) );
|
|
POCKFFTR_PRE(_forward)(planr, Y, 1.);
|
|
assert( X[Nmax] == checkVal );
|
|
memcpy(X, Y, nextPow2N * sizeof(pffft_scalar) );
|
|
POCKFFTR_PRE(_backward)(planr, X, 1./nextPow2N);
|
|
assert( X[Nmax] == checkVal );
|
|
}
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
if (cplx) {
|
|
POCKFFTC_MID(destroy_,_plan)(planc);
|
|
} else {
|
|
POCKFFTR_MID(destroy_,_plan)(planr);
|
|
}
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_POCKET] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_POCKET] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_POCKET] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_POCKET] = show_output("Pocket", N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_POCKET] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_POCKET] = 1;
|
|
} else {
|
|
show_output("Pocket", N, cplx, -1, -1, -1, -1, tableFile);
|
|
}
|
|
#endif
|
|
|
|
|
|
/* PFFFT-U (unordered) benchmark */
|
|
Nmax = (cplx ? pffftPow2N*2 : pffftPow2N);
|
|
X[Nmax] = checkVal;
|
|
if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
|
|
{
|
|
te = uclock_sec();
|
|
PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
|
if (s) {
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
assert( X[Nmax] == checkVal );
|
|
PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_FORWARD);
|
|
assert( X[Nmax] == checkVal );
|
|
PFFFT_FUNC(transform)(s, X, Z, Y, PFFFT_BACKWARD);
|
|
assert( X[Nmax] == checkVal );
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
PFFFT_FUNC(destroy_setup)(s);
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_PFFFT_U] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_PFFFT_U] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_PFFFT_U] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_PFFFT_U] = show_output("PFFFT-U", N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_PFFFT_U] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_PFFFT_U] = 1;
|
|
}
|
|
} else {
|
|
show_output("PFFFT-U", N, cplx, -1, -1, -1, -1, tableFile);
|
|
}
|
|
|
|
|
|
if ( pffftPow2N >= PFFFT_FUNC(min_fft_size)(cplx ? PFFFT_COMPLEX : PFFFT_REAL) )
|
|
{
|
|
te = uclock_sec();
|
|
PFFFT_SETUP *s = PFFFT_FUNC(new_setup)(pffftPow2N, cplx ? PFFFT_COMPLEX : PFFFT_REAL);
|
|
if (s) {
|
|
t0 = uclock_sec();
|
|
tstop = t0 + max_test_duration;
|
|
max_iter = 0;
|
|
do {
|
|
for ( k = 0; k < step_iter; ++k ) {
|
|
assert( X[Nmax] == checkVal );
|
|
PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_FORWARD);
|
|
assert( X[Nmax] == checkVal );
|
|
PFFFT_FUNC(transform_ordered)(s, X, Z, Y, PFFFT_BACKWARD);
|
|
assert( X[Nmax] == checkVal );
|
|
++max_iter;
|
|
}
|
|
t1 = uclock_sec();
|
|
} while ( t1 < tstop );
|
|
|
|
PFFFT_FUNC(destroy_setup)(s);
|
|
|
|
flops = (max_iter*2) * ((cplx ? 5 : 2.5)*N*log((double)N)/M_LN2); /* see http://www.fftw.org/speed/method.html */
|
|
tmeas[TYPE_ITER][ALGO_PFFFT_O] = max_iter;
|
|
tmeas[TYPE_MFLOPS][ALGO_PFFFT_O] = flops/1e6/(t1 - t0 + 1e-16);
|
|
tmeas[TYPE_DUR_TOT][ALGO_PFFFT_O] = t1 - t0;
|
|
tmeas[TYPE_DUR_NS][ALGO_PFFFT_O] = show_output("PFFFT", N, cplx, flops, t0, t1, max_iter, tableFile);
|
|
tmeas[TYPE_PREP][ALGO_PFFFT_O] = (t0 - te) * 1e3;
|
|
haveAlgo[ALGO_PFFFT_O] = 1;
|
|
}
|
|
} else {
|
|
show_output("PFFFT", N, cplx, -1, -1, -1, -1, tableFile);
|
|
}
|
|
|
|
if (!array_output_format)
|
|
{
|
|
printf("prepare/ms: ");
|
|
for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
|
|
{
|
|
if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
|
|
printf("%s %.3f ", algoName[iter], tmeas[TYPE_PREP][iter] );
|
|
}
|
|
}
|
|
printf("\n");
|
|
}
|
|
Tfastest = 0.0;
|
|
for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
|
|
{
|
|
if ( Tfastest == 0.0 || ( tmeas[TYPE_DUR_NS][iter] != 0.0 && tmeas[TYPE_DUR_NS][iter] < Tfastest ) )
|
|
Tfastest = tmeas[TYPE_DUR_NS][iter];
|
|
}
|
|
if ( Tfastest > 0.0 )
|
|
{
|
|
if (!array_output_format)
|
|
printf("relative fast: ");
|
|
for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
|
|
{
|
|
if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
|
|
tmeas[TYPE_DUR_FASTEST][iter] = tmeas[TYPE_DUR_NS][iter] / Tfastest;
|
|
if (!array_output_format)
|
|
printf("%s %.3f ", algoName[iter], tmeas[TYPE_DUR_FASTEST][iter] );
|
|
}
|
|
}
|
|
if (!array_output_format)
|
|
printf("\n");
|
|
}
|
|
|
|
{
|
|
if (!array_output_format)
|
|
printf("relative pffft: ");
|
|
for ( iter = 0; iter < NUM_FFT_ALGOS; ++iter )
|
|
{
|
|
if ( haveAlgo[iter] && tmeas[TYPE_DUR_NS][iter] > 0.0 ) {
|
|
tmeas[TYPE_REL_PFFFT][iter] = tmeas[TYPE_DUR_NS][iter] / tmeas[TYPE_DUR_NS][ALGO_PFFFT_O];
|
|
if (!array_output_format)
|
|
printf("%s %.3f ", algoName[iter], tmeas[TYPE_REL_PFFFT][iter] );
|
|
}
|
|
}
|
|
if (!array_output_format)
|
|
printf("\n");
|
|
}
|
|
|
|
if (!array_output_format) {
|
|
printf("--\n");
|
|
}
|
|
|
|
PFFFT_FUNC(aligned_free)(X);
|
|
PFFFT_FUNC(aligned_free)(Y);
|
|
PFFFT_FUNC(aligned_free)(Z);
|
|
}
|
|
|
|
|
|
/* small functions inside pffft.c that will detect (compiler) bugs with respect to simd instructions */
|
|
void validate_pffft_simd();
|
|
int validate_pffft_simd_ex(FILE * DbgOut);
|
|
void validate_pffftd_simd();
|
|
int validate_pffftd_simd_ex(FILE * DbgOut);
|
|
|
|
|
|
|
|
int main(int argc, char **argv) {
|
|
/* unfortunately, the fft size must be a multiple of 16 for complex FFTs
|
|
and 32 for real FFTs -- a lot of stuff would need to be rewritten to
|
|
handle other cases (or maybe just switch to a scalar fft, I don't know..) */
|
|
|
|
#if 0 /* include powers of 2 ? */
|
|
#define NUMNONPOW2LENS 23
|
|
int NnonPow2[NUMNONPOW2LENS] = {
|
|
64, 96, 128, 160, 192, 256, 384, 5*96, 512, 5*128,
|
|
3*256, 800, 1024, 2048, 2400, 4096, 8192, 9*1024, 16384, 32768,
|
|
256*1024, 1024*1024, -1 };
|
|
#else
|
|
#define NUMNONPOW2LENS 11
|
|
int NnonPow2[NUMNONPOW2LENS] = {
|
|
96, 160, 192, 384, 5*96, 5*128,3*256, 800, 2400, 9*1024,
|
|
-1 };
|
|
#endif
|
|
|
|
#define NUMPOW2FFTLENS 22
|
|
#define MAXNUMFFTLENS MAX( NUMPOW2FFTLENS, NUMNONPOW2LENS )
|
|
int Npow2[NUMPOW2FFTLENS]; /* exp = 1 .. 21, -1 */
|
|
const int *Nvalues = NULL;
|
|
double tmeas[2][MAXNUMFFTLENS][NUM_TYPES][NUM_FFT_ALGOS];
|
|
double iterCalReal, iterCalCplx;
|
|
|
|
int benchReal=1, benchCplx=1, withFFTWfullMeas=0, outputTable2File=1, usePow2=1;
|
|
int realCplxIdx, typeIdx;
|
|
int i, k;
|
|
FILE *tableFile = NULL;
|
|
|
|
int haveAlgo[NUM_FFT_ALGOS];
|
|
char acCsvFilename[64];
|
|
|
|
for ( k = 1; k <= NUMPOW2FFTLENS; ++k )
|
|
Npow2[k-1] = (k == NUMPOW2FFTLENS) ? -1 : (1 << k);
|
|
Nvalues = Npow2; /* set default .. for comparisons .. */
|
|
|
|
for ( i = 0; i < NUM_FFT_ALGOS; ++i )
|
|
haveAlgo[i] = 0;
|
|
|
|
printf("pffft architecture: '%s'\n", PFFFT_FUNC(simd_arch)());
|
|
printf("pffft SIMD size: %d\n", PFFFT_FUNC(simd_size)());
|
|
printf("pffft min real fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_REAL));
|
|
printf("pffft min complex fft: %d\n", PFFFT_FUNC(min_fft_size)(PFFFT_COMPLEX));
|
|
printf("\n");
|
|
|
|
for ( i = 1; i < argc; ++i ) {
|
|
if (!strcmp(argv[i], "--array-format") || !strcmp(argv[i], "--table")) {
|
|
array_output_format = 1;
|
|
}
|
|
else if (!strcmp(argv[i], "--no-tab")) {
|
|
array_output_format = 0;
|
|
}
|
|
else if (!strcmp(argv[i], "--real")) {
|
|
benchCplx = 0;
|
|
}
|
|
else if (!strcmp(argv[i], "--cplx")) {
|
|
benchReal = 0;
|
|
}
|
|
else if (!strcmp(argv[i], "--fftw-full-measure")) {
|
|
withFFTWfullMeas = 1;
|
|
}
|
|
else if (!strcmp(argv[i], "--non-pow2")) {
|
|
Nvalues = NnonPow2;
|
|
usePow2 = 0;
|
|
}
|
|
else /* if (!strcmp(argv[i], "--help")) */ {
|
|
printf("usage: %s [--array-format|--table] [--no-tab] [--real|--cplx] [--fftw-full-measure] [--non-pow2]\n", argv[0]);
|
|
exit(0);
|
|
}
|
|
}
|
|
|
|
#ifdef HAVE_FFTW
|
|
#ifdef PFFFT_ENABLE_DOUBLE
|
|
algoName[ALGO_FFTW_ESTIM] = "FFTW D(estim)";
|
|
algoName[ALGO_FFTW_AUTO] = "FFTW D(auto) ";
|
|
#endif
|
|
|
|
if (withFFTWfullMeas)
|
|
{
|
|
#ifdef PFFFT_ENABLE_FLOAT
|
|
algoName[ALGO_FFTW_AUTO] = "FFTWF(meas)"; /* "FFTW (auto)" */
|
|
#else
|
|
algoName[ALGO_FFTW_AUTO] = "FFTWD(meas)"; /* "FFTW (auto)" */
|
|
#endif
|
|
algoTableHeader[NUM_FFT_ALGOS][0] = "|real FFTWmeas "; /* "|real FFTWauto " */
|
|
algoTableHeader[NUM_FFT_ALGOS][0] = "|cplx FFTWmeas "; /* "|cplx FFTWauto " */
|
|
}
|
|
#endif
|
|
|
|
if ( PFFFT_FUNC(simd_size)() == 1 )
|
|
{
|
|
algoName[ALGO_PFFFT_U] = "PFFFTU scal-1";
|
|
algoName[ALGO_PFFFT_O] = "PFFFT scal-1 ";
|
|
}
|
|
else if ( !strcmp(PFFFT_FUNC(simd_arch)(), "4xScalar") )
|
|
{
|
|
algoName[ALGO_PFFFT_U] = "PFFFT-U scal4";
|
|
algoName[ALGO_PFFFT_O] = "PFFFT scal-4 ";
|
|
}
|
|
|
|
|
|
clock();
|
|
/* double TClockDur = 1.0 / CLOCKS_PER_SEC;
|
|
printf("clock() duration for CLOCKS_PER_SEC = %f sec = %f ms\n", TClockDur, 1000.0 * TClockDur );
|
|
*/
|
|
|
|
/* calibrate test duration */
|
|
{
|
|
double t0, t1, dur;
|
|
printf("calibrating fft benchmark duration at size N = 512 ..\n");
|
|
t0 = uclock_sec();
|
|
if (benchReal) {
|
|
iterCalReal = cal_benchmark(512, 0 /* real fft */);
|
|
printf("real fft iterCal = %f\n", iterCalReal);
|
|
}
|
|
if (benchCplx) {
|
|
iterCalCplx = cal_benchmark(512, 1 /* cplx fft */);
|
|
printf("cplx fft iterCal = %f\n", iterCalCplx);
|
|
}
|
|
t1 = uclock_sec();
|
|
dur = t1 - t0;
|
|
printf("calibration done in %f sec.\n\n", dur);
|
|
}
|
|
|
|
if (!array_output_format) {
|
|
if (benchReal) {
|
|
for (i=0; Nvalues[i] > 0; ++i)
|
|
benchmark_ffts(Nvalues[i], 0 /* real fft */, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, NULL);
|
|
}
|
|
if (benchCplx) {
|
|
for (i=0; Nvalues[i] > 0; ++i)
|
|
benchmark_ffts(Nvalues[i], 1 /* cplx fft */, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, NULL);
|
|
}
|
|
|
|
} else {
|
|
|
|
if (outputTable2File) {
|
|
tableFile = fopen( usePow2 ? "bench-fft-table-pow2.txt" : "bench-fft-table-non2.txt", "w");
|
|
}
|
|
/* print table headers */
|
|
printf("table shows MFlops; higher values indicate faster computation\n\n");
|
|
|
|
{
|
|
print_table("| input len ", tableFile);
|
|
for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
|
|
{
|
|
if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
|
|
continue;
|
|
for (k=0; k < NUM_FFT_ALGOS; ++k)
|
|
{
|
|
if ( compiledInAlgo[k] )
|
|
print_table(algoTableHeader[k][realCplxIdx], tableFile);
|
|
}
|
|
}
|
|
print_table("|\n", tableFile);
|
|
}
|
|
/* print table value seperators */
|
|
{
|
|
print_table("|----------", tableFile);
|
|
for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
|
|
{
|
|
if ( (realCplxIdx == 0 && !benchReal) || (realCplxIdx == 1 && !benchCplx) )
|
|
continue;
|
|
for (k=0; k < NUM_FFT_ALGOS; ++k)
|
|
{
|
|
if ( compiledInAlgo[k] )
|
|
print_table(":|-------------", tableFile);
|
|
}
|
|
}
|
|
print_table(":|\n", tableFile);
|
|
}
|
|
|
|
for (i=0; Nvalues[i] > 0; ++i) {
|
|
{
|
|
double t0, t1;
|
|
print_table_fftsize(Nvalues[i], tableFile);
|
|
t0 = uclock_sec();
|
|
if (benchReal)
|
|
benchmark_ffts(Nvalues[i], 0, withFFTWfullMeas, iterCalReal, tmeas[0][i], haveAlgo, tableFile);
|
|
if (benchCplx)
|
|
benchmark_ffts(Nvalues[i], 1, withFFTWfullMeas, iterCalCplx, tmeas[1][i], haveAlgo, tableFile);
|
|
t1 = uclock_sec();
|
|
print_table("|\n", tableFile);
|
|
/* printf("all ffts for size %d took %f sec\n", Nvalues[i], t1-t0); */
|
|
(void)t0;
|
|
(void)t1;
|
|
}
|
|
}
|
|
fprintf(stdout, " (numbers are given in MFlops)\n");
|
|
if (outputTable2File) {
|
|
fclose(tableFile);
|
|
}
|
|
}
|
|
|
|
printf("\n");
|
|
printf("now writing .csv files ..\n");
|
|
|
|
for (realCplxIdx = 0; realCplxIdx < 2; ++realCplxIdx)
|
|
{
|
|
if ( (benchReal && realCplxIdx == 0) || (benchCplx && realCplxIdx == 1) )
|
|
{
|
|
for (typeIdx = 0; typeIdx < NUM_TYPES; ++typeIdx)
|
|
{
|
|
FILE *f = NULL;
|
|
if ( !(SAVE_ALL_TYPES || saveType[typeIdx]) )
|
|
continue;
|
|
acCsvFilename[0] = 0;
|
|
#ifdef PFFFT_SIMD_DISABLE
|
|
strcat(acCsvFilename, "scal-");
|
|
#else
|
|
strcat(acCsvFilename, "simd-");
|
|
#endif
|
|
strcat(acCsvFilename, (realCplxIdx == 0 ? "real-" : "cplx-"));
|
|
strcat(acCsvFilename, ( usePow2 ? "pow2-" : "non2-"));
|
|
assert( strlen(acCsvFilename) + strlen(typeFilenamePart[typeIdx]) + 5 < (sizeof(acCsvFilename) / sizeof(acCsvFilename[0])) );
|
|
strcat(acCsvFilename, typeFilenamePart[typeIdx]);
|
|
strcat(acCsvFilename, ".csv");
|
|
f = fopen(acCsvFilename, "w");
|
|
if (!f)
|
|
continue;
|
|
{
|
|
fprintf(f, "size, log2, ");
|
|
for (k=0; k < NUM_FFT_ALGOS; ++k)
|
|
if ( haveAlgo[k] )
|
|
fprintf(f, "%s, ", algoName[k]);
|
|
fprintf(f, "\n");
|
|
}
|
|
for (i=0; Nvalues[i] > 0; ++i)
|
|
{
|
|
{
|
|
fprintf(f, "%d, %.3f, ", Nvalues[i], log10((double)Nvalues[i])/log10(2.0) );
|
|
for (k=0; k < NUM_FFT_ALGOS; ++k)
|
|
if ( haveAlgo[k] )
|
|
fprintf(f, "%f, ", tmeas[realCplxIdx][i][typeIdx][k]);
|
|
fprintf(f, "\n");
|
|
}
|
|
}
|
|
fclose(f);
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|