You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1129 lines
43 KiB

//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "testBase.h"
#include "harness/conversions.h"
#ifndef _WIN32
#include <unistd.h>
#endif
#define INT_TEST_VALUE 402258822
#define LONG_TEST_VALUE 515154531254381446LL
const char *atomic_global_pattern[] = {
"__kernel void test_atomic_fn(volatile __global %s *destMemory, __global %s *oldValues)\n"
"{\n"
" int tid = get_global_id(0);\n"
"\n"
,
"\n"
"}\n" };
const char *atomic_local_pattern[] = {
"__kernel void test_atomic_fn(__global %s *finalDest, __global %s *oldValues, volatile __local %s *destMemory, int numDestItems )\n"
"{\n"
" int tid = get_global_id(0);\n"
" int dstItemIdx;\n"
"\n"
" // Everybody does the following line(s), but it all has the same result. We still need to ensure we sync before the atomic op, though\n"
" for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
" destMemory[ dstItemIdx ] = finalDest[ dstItemIdx ];\n"
" barrier( CLK_LOCAL_MEM_FENCE );\n"
"\n"
,
" barrier( CLK_LOCAL_MEM_FENCE );\n"
" // Finally, write out the last value. Again, we're synced, so everyone will be writing the same value\n"
" for( dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++ )\n"
" finalDest[ dstItemIdx ] = destMemory[ dstItemIdx ];\n"
"}\n" };
#define TEST_COUNT 128 * 1024
struct TestFns
{
cl_int mIntStartValue;
cl_long mLongStartValue;
size_t (*NumResultsFn)( size_t threadSize, ExplicitType dataType );
// Integer versions
cl_int (*ExpectedValueIntFn)( size_t size, cl_int *startRefValues, size_t whichDestValue );
void (*GenerateRefsIntFn)( size_t size, cl_int *startRefValues, MTdata d );
bool (*VerifyRefsIntFn)( size_t size, cl_int *refValues, cl_int finalValue );
// Long versions
cl_long (*ExpectedValueLongFn)( size_t size, cl_long *startRefValues, size_t whichDestValue );
void (*GenerateRefsLongFn)( size_t size, cl_long *startRefValues, MTdata d );
bool (*VerifyRefsLongFn)( size_t size, cl_long *refValues, cl_long finalValue );
// Float versions
cl_float (*ExpectedValueFloatFn)( size_t size, cl_float *startRefValues, size_t whichDestValue );
void (*GenerateRefsFloatFn)( size_t size, cl_float *startRefValues, MTdata d );
bool (*VerifyRefsFloatFn)( size_t size, cl_float *refValues, cl_float finalValue );
};
bool check_atomic_support( cl_device_id device, bool extended, bool isLocal, ExplicitType dataType )
{
const char *extensionNames[8] = {
"cl_khr_global_int32_base_atomics", "cl_khr_global_int32_extended_atomics",
"cl_khr_local_int32_base_atomics", "cl_khr_local_int32_extended_atomics",
"cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics",
"cl_khr_int64_base_atomics", "cl_khr_int64_extended_atomics" // this line intended to be the same as the last one
};
size_t index = 0;
if( extended )
index += 1;
if( isLocal )
index += 2;
Version version = get_device_cl_version(device);
switch (dataType)
{
case kInt:
case kUInt:
if( version >= Version(1,1) )
return 1;
break;
case kLong:
case kULong:
index += 4;
break;
case kFloat: // this has to stay separate since the float atomics arent in the 1.0 extensions
return version >= Version(1,1);
default:
log_error( "ERROR: Unsupported data type (%d) in check_atomic_support\n", dataType );
return 0;
}
return is_extension_available( device, extensionNames[index] );
}
int test_atomic_function(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
TestFns testFns,
bool extended, bool isLocal, ExplicitType dataType, bool matchGroupSize )
{
clProgramWrapper program;
clKernelWrapper kernel;
int error;
size_t threads[1];
clMemWrapper streams[2];
void *refValues, *startRefValues;
size_t threadSize, groupSize;
const char *programLines[4];
char pragma[ 512 ];
char programHeader[ 512 ];
MTdata d;
size_t typeSize = get_explicit_type_size( dataType );
// Verify we can run first
bool isUnsigned = ( dataType == kULong ) || ( dataType == kUInt );
if( !check_atomic_support( deviceID, extended, isLocal, dataType ) )
{
// Only print for the signed (unsigned comes right after, and if signed isn't supported, unsigned isn't either)
if( dataType == kFloat )
log_info( "\t%s float not supported\n", isLocal ? "Local" : "Global" );
else if( !isUnsigned )
log_info( "\t%s %sint%d not supported\n", isLocal ? "Local" : "Global", isUnsigned ? "u" : "", (int)typeSize * 8 );
// Since we don't support the operation, they implicitly pass
return 0;
}
else
{
if( dataType == kFloat )
log_info( "\t%s float%s...", isLocal ? "local" : "global", isLocal ? " " : "" );
else
log_info( "\t%s %sint%d%s%s...", isLocal ? "local" : "global", isUnsigned ? "u" : "",
(int)typeSize * 8, isUnsigned ? "" : " ", isLocal ? " " : "" );
}
//// Set up the kernel code
// Create the pragma line for this kernel
bool isLong = ( dataType == kLong || dataType == kULong );
sprintf( pragma, "#pragma OPENCL EXTENSION cl_khr%s_int%s_%s_atomics : enable\n",
isLong ? "" : (isLocal ? "_local" : "_global"), isLong ? "64" : "32",
extended ? "extended" : "base" );
// Now create the program header
const char *typeName = get_explicit_type_name( dataType );
if( isLocal )
sprintf( programHeader, atomic_local_pattern[ 0 ], typeName, typeName, typeName );
else
sprintf( programHeader, atomic_global_pattern[ 0 ], typeName, typeName );
// Set up our entire program now
programLines[ 0 ] = pragma;
programLines[ 1 ] = programHeader;
programLines[ 2 ] = programCore;
programLines[ 3 ] = ( isLocal ) ? atomic_local_pattern[ 1 ] : atomic_global_pattern[ 1 ];
if( create_single_kernel_helper( context, &program, &kernel, 4, programLines, "test_atomic_fn" ) )
{
return -1;
}
//// Set up to actually run
threadSize = num_elements;
error = get_max_common_work_group_size( context, kernel, threadSize, &groupSize );
test_error( error, "Unable to get thread group max size" );
if( matchGroupSize )
// HACK because xchg and cmpxchg apparently are limited by hardware
threadSize = groupSize;
if( isLocal )
{
size_t maxSizes[3] = {0, 0, 0};
error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(size_t), maxSizes, 0);
test_error( error, "Unable to obtain max work item sizes for the device" );
size_t workSize;
error = clGetKernelWorkGroupInfo( kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof( workSize ), &workSize, NULL );
test_error( error, "Unable to obtain max work group size for device and kernel combo" );
// "workSize" is limited to that of the first dimension as only a 1DRange is executed.
if( maxSizes[0] < workSize )
{
workSize = maxSizes[0];
}
threadSize = groupSize = workSize;
}
log_info( "\t(thread count %d, group size %d)\n", (int)threadSize, (int)groupSize );
refValues = (cl_int *)malloc( typeSize * threadSize );
if( testFns.GenerateRefsIntFn != NULL )
{
// We have a ref generator provided
d = init_genrand( gRandomSeed );
startRefValues = malloc( typeSize * threadSize );
if( typeSize == 4 )
testFns.GenerateRefsIntFn( threadSize, (cl_int *)startRefValues, d );
else
testFns.GenerateRefsLongFn( threadSize, (cl_long *)startRefValues, d );
free_mtdata(d);
d = NULL;
}
else
startRefValues = NULL;
// If we're given a num_results function, we need to determine how many result objects we need. If
// we don't have it, we assume it's just 1
size_t numDestItems = ( testFns.NumResultsFn != NULL ) ? testFns.NumResultsFn( threadSize, dataType ) : 1;
char * destItems = new char[ typeSize * numDestItems ];
if( destItems == NULL )
{
log_error( "ERROR: Unable to allocate memory!\n" );
return -1;
}
void * startValue = ( typeSize == 4 ) ? (void *)&testFns.mIntStartValue : (void *)&testFns.mLongStartValue;
for( size_t i = 0; i < numDestItems; i++ )
memcpy( destItems + i * typeSize, startValue, typeSize );
streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
typeSize * numDestItems, destItems, NULL);
if (!streams[0])
{
log_error("ERROR: Creating output array failed!\n");
return -1;
}
streams[1] = clCreateBuffer(
context,
((startRefValues != NULL ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
typeSize * threadSize, startRefValues, NULL);
if (!streams[1])
{
log_error("ERROR: Creating reference array failed!\n");
return -1;
}
/* Set the arguments */
error = clSetKernelArg( kernel, 0, sizeof( streams[0] ), &streams[0] );
test_error( error, "Unable to set indexed kernel arguments" );
error = clSetKernelArg( kernel, 1, sizeof( streams[1] ), &streams[1] );
test_error( error, "Unable to set indexed kernel arguments" );
if( isLocal )
{
error = clSetKernelArg( kernel, 2, typeSize * numDestItems, NULL );
test_error( error, "Unable to set indexed local kernel argument" );
cl_int numDestItemsInt = (cl_int)numDestItems;
error = clSetKernelArg( kernel, 3, sizeof( cl_int ), &numDestItemsInt );
test_error( error, "Unable to set indexed kernel argument" );
}
/* Run the kernel */
threads[0] = threadSize;
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, &groupSize, 0, NULL, NULL );
test_error( error, "Unable to execute test kernel" );
error = clEnqueueReadBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
test_error( error, "Unable to read result value!" );
error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize * threadSize, refValues, 0, NULL, NULL );
test_error( error, "Unable to read reference values!" );
// If we have an expectedFn, then we need to generate a final value to compare against. If we don't
// have one, it's because we're comparing ref values only
if( testFns.ExpectedValueIntFn != NULL )
{
for( size_t i = 0; i < numDestItems; i++ )
{
char expected[ 8 ];
cl_int intVal;
cl_long longVal;
if( typeSize == 4 )
{
// Int version
intVal = testFns.ExpectedValueIntFn( threadSize, (cl_int *)startRefValues, i );
memcpy( expected, &intVal, sizeof( intVal ) );
}
else
{
// Long version
longVal = testFns.ExpectedValueLongFn( threadSize, (cl_long *)startRefValues, i );
memcpy( expected, &longVal, sizeof( longVal ) );
}
if( memcmp( expected, destItems + i * typeSize, typeSize ) != 0 )
{
if( typeSize == 4 )
{
cl_int *outValue = (cl_int *)( destItems + i * typeSize );
log_error( "ERROR: Result %ld from kernel does not validate! (should be %d, was %d)\n", i, intVal, *outValue );
cl_int *startRefs = (cl_int *)startRefValues;
cl_int *refs = (cl_int *)refValues;
for( i = 0; i < threadSize; i++ )
{
if( startRefs != NULL )
log_info( " --- %ld - %d --- %d\n", i, startRefs[i], refs[i] );
else
log_info( " --- %ld --- %d\n", i, refs[i] );
}
}
else
{
cl_long *outValue = (cl_long *)( destItems + i * typeSize );
log_error( "ERROR: Result %ld from kernel does not validate! (should be %lld, was %lld)\n", i, longVal, *outValue );
cl_long *startRefs = (cl_long *)startRefValues;
cl_long *refs = (cl_long *)refValues;
for( i = 0; i < threadSize; i++ )
{
if( startRefs != NULL )
log_info( " --- %ld - %lld --- %lld\n", i, startRefs[i], refs[i] );
else
log_info( " --- %ld --- %lld\n", i, refs[i] );
}
}
return -1;
}
}
}
if( testFns.VerifyRefsIntFn != NULL )
{
/* Use the verify function to also check the results */
if( dataType == kFloat )
{
cl_float *outValue = (cl_float *)destItems;
if( !testFns.VerifyRefsFloatFn( threadSize, (cl_float *)refValues, *outValue ) != 0 )
{
log_error( "ERROR: Reference values did not validate!\n" );
return -1;
}
}
else if( typeSize == 4 )
{
cl_int *outValue = (cl_int *)destItems;
if( !testFns.VerifyRefsIntFn( threadSize, (cl_int *)refValues, *outValue ) != 0 )
{
log_error( "ERROR: Reference values did not validate!\n" );
return -1;
}
}
else
{
cl_long *outValue = (cl_long *)destItems;
if( !testFns.VerifyRefsLongFn( threadSize, (cl_long *)refValues, *outValue ) != 0 )
{
log_error( "ERROR: Reference values did not validate!\n" );
return -1;
}
}
}
else if( testFns.ExpectedValueIntFn == NULL )
{
log_error( "ERROR: Test doesn't check total or refs; no values are verified!\n" );
return -1;
}
/* Re-write the starting value */
for( size_t i = 0; i < numDestItems; i++ )
memcpy( destItems + i * typeSize, startValue, typeSize );
error = clEnqueueWriteBuffer( queue, streams[0], true, 0, typeSize * numDestItems, destItems, 0, NULL, NULL );
test_error( error, "Unable to write starting values!" );
/* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
threads[0] = 1;
error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, threads, 0, NULL, NULL );
test_error( error, "Unable to execute test kernel" );
error = clEnqueueReadBuffer( queue, streams[1], true, 0, typeSize, refValues, 0, NULL, NULL );
test_error( error, "Unable to read reference values!" );
if( memcmp( refValues, destItems, typeSize ) != 0 )
{
if( typeSize == 4 )
{
cl_int *s = (cl_int *)destItems;
cl_int *r = (cl_int *)refValues;
log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
" (should have been %d, returned %d)!\n", *s, *r );
}
else
{
cl_long *s = (cl_long *)destItems;
cl_long *r = (cl_long *)refValues;
log_error( "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
" (should have been %lld, returned %lld)!\n", *s, *r );
}
return -1;
}
delete [] destItems;
free( refValues );
if( startRefValues != NULL )
free( startRefValues );
return 0;
}
int test_atomic_function_set(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, const char *programCore,
TestFns testFns,
bool extended, bool matchGroupSize, bool usingAtomicPrefix )
{
log_info(" Testing %s functions...\n", usingAtomicPrefix ? "atomic_" : "atom_");
int errors = 0;
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kInt, matchGroupSize );
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kUInt, matchGroupSize );
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kInt, matchGroupSize );
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kUInt, matchGroupSize );
// Only the 32 bit atomic functions use the "atomic" prefix in 1.1, the 64 bit functions still use the "atom" prefix.
// The argument usingAtomicPrefix is set to true if programCore was generated with the "atomic" prefix.
if (!usingAtomicPrefix) {
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kLong, matchGroupSize );
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, false, kULong, matchGroupSize );
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kLong, matchGroupSize );
errors |= test_atomic_function( deviceID, context, queue, num_elements, programCore, testFns, extended, true, kULong, matchGroupSize );
}
return errors;
}
#pragma mark ---- add
const char atom_add_core[] =
" oldValues[tid] = atom_add( &destMemory[0], tid + 3 );\n"
" atom_add( &destMemory[0], tid + 3 );\n"
" atom_add( &destMemory[0], tid + 3 );\n"
" atom_add( &destMemory[0], tid + 3 );\n";
const char atomic_add_core[] =
" oldValues[tid] = atomic_add( &destMemory[0], tid + 3 );\n"
" atomic_add( &destMemory[0], tid + 3 );\n"
" atomic_add( &destMemory[0], tid + 3 );\n"
" atomic_add( &destMemory[0], tid + 3 );\n";
cl_int test_atomic_add_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
cl_int total = 0;
for( size_t i = 0; i < size; i++ )
total += ( (cl_int)i + 3 ) * 4;
return total;
}
cl_long test_atomic_add_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
cl_long total = 0;
for( size_t i = 0; i < size; i++ )
total += ( ( i + 3 ) * 4 );
return total;
}
int test_atomic_add(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { 0, 0LL, NULL, test_atomic_add_result_int, NULL, NULL, test_atomic_add_result_long, NULL, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_add_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- sub
const char atom_sub_core[] = " oldValues[tid] = atom_sub( &destMemory[0], tid + 3 );\n";
const char atomic_sub_core[] = " oldValues[tid] = atomic_sub( &destMemory[0], tid + 3 );\n";
cl_int test_atomic_sub_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
cl_int total = INT_TEST_VALUE;
for( size_t i = 0; i < size; i++ )
total -= (cl_int)i + 3;
return total;
}
cl_long test_atomic_sub_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
cl_long total = LONG_TEST_VALUE;
for( size_t i = 0; i < size; i++ )
total -= i + 3;
return total;
}
int test_atomic_sub(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_sub_result_int, NULL, NULL, test_atomic_sub_result_long, NULL, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_sub_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- xchg
const char atom_xchg_core[] = " oldValues[tid] = atom_xchg( &destMemory[0], tid );\n";
const char atomic_xchg_core[] = " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
const char atomic_xchg_float_core[] = " oldValues[tid] = atomic_xchg( &destMemory[0], tid );\n";
bool test_atomic_xchg_verify_int( size_t size, cl_int *refValues, cl_int finalValue )
{
/* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
char *valids;
size_t i;
char originalValidCount = 0;
valids = (char *)malloc( sizeof( char ) * size );
memset( valids, 0, sizeof( char ) * size );
for( i = 0; i < size; i++ )
{
if( refValues[ i ] == INT_TEST_VALUE )
{
// Special initial value
originalValidCount++;
continue;
}
if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
{
log_error( "ERROR: Reference value %ld outside of valid range! (%d)\n", i, refValues[ i ] );
return false;
}
valids[ refValues[ i ] ] ++;
}
/* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
the final value outputted */
if( valids[ finalValue ] > 0 )
{
log_error( "ERROR: Final value %d was also in ref list!\n", finalValue );
return false;
}
else
valids[ finalValue ] = 1; // So the following loop will be okay
/* Now check that every entry has one and only one count */
if( originalValidCount != 1 )
{
log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
return false;
}
for( i = 0; i < size; i++ )
{
if( valids[ i ] != 1 )
{
log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
for( size_t j = 0; j < size; j++ )
log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
return false;
}
}
free( valids );
return true;
}
bool test_atomic_xchg_verify_long( size_t size, cl_long *refValues, cl_long finalValue )
{
/* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
char *valids;
size_t i;
char originalValidCount = 0;
valids = (char *)malloc( sizeof( char ) * size );
memset( valids, 0, sizeof( char ) * size );
for( i = 0; i < size; i++ )
{
if( refValues[ i ] == LONG_TEST_VALUE )
{
// Special initial value
originalValidCount++;
continue;
}
if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
{
log_error( "ERROR: Reference value %ld outside of valid range! (%lld)\n", i, refValues[ i ] );
return false;
}
valids[ refValues[ i ] ] ++;
}
/* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
the final value outputted */
if( valids[ finalValue ] > 0 )
{
log_error( "ERROR: Final value %lld was also in ref list!\n", finalValue );
return false;
}
else
valids[ finalValue ] = 1; // So the following loop will be okay
/* Now check that every entry has one and only one count */
if( originalValidCount != 1 )
{
log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
return false;
}
for( i = 0; i < size; i++ )
{
if( valids[ i ] != 1 )
{
log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
for( size_t j = 0; j < size; j++ )
log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
return false;
}
}
free( valids );
return true;
}
bool test_atomic_xchg_verify_float( size_t size, cl_float *refValues, cl_float finalValue )
{
/* For xchg, each value from 0 to size - 1 should have an entry in the ref array, and ONLY one entry */
char *valids;
size_t i;
char originalValidCount = 0;
valids = (char *)malloc( sizeof( char ) * size );
memset( valids, 0, sizeof( char ) * size );
for( i = 0; i < size; i++ )
{
cl_int *intRefValue = (cl_int *)( &refValues[ i ] );
if( *intRefValue == INT_TEST_VALUE )
{
// Special initial value
originalValidCount++;
continue;
}
if( refValues[ i ] < 0 || (size_t)refValues[ i ] >= size )
{
log_error( "ERROR: Reference value %ld outside of valid range! (%a)\n", i, refValues[ i ] );
return false;
}
valids[ (int)refValues[ i ] ] ++;
}
/* Note: ONE entry will have zero count. It'll be the last one that executed, because that value should be
the final value outputted */
if( valids[ (int)finalValue ] > 0 )
{
log_error( "ERROR: Final value %a was also in ref list!\n", finalValue );
return false;
}
else
valids[ (int)finalValue ] = 1; // So the following loop will be okay
/* Now check that every entry has one and only one count */
if( originalValidCount != 1 )
{
log_error( "ERROR: Starting reference value %d did not occur once-and-only-once (occurred %d)\n", 65191, originalValidCount );
return false;
}
for( i = 0; i < size; i++ )
{
if( valids[ i ] != 1 )
{
log_error( "ERROR: Reference value %ld did not occur once-and-only-once (occurred %d)\n", i, valids[ i ] );
for( size_t j = 0; j < size; j++ )
log_info( "%d: %d\n", (int)j, (int)valids[ j ] );
return false;
}
}
free( valids );
return true;
}
int test_atomic_xchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, NULL, NULL, test_atomic_xchg_verify_int, NULL, NULL, test_atomic_xchg_verify_long, NULL, NULL, test_atomic_xchg_verify_float };
int errors = test_atomic_function_set( deviceID, context, queue, num_elements, atom_xchg_core, set, false, true, /*usingAtomicPrefix*/ false );
errors |= test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xchg_core, set, false, true, /*usingAtomicPrefix*/ true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, false, kFloat, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_xchg_float_core, set, false, true, kFloat, true );
return errors;
}
#pragma mark ---- min
const char atom_min_core[] = " oldValues[tid] = atom_min( &destMemory[0], oldValues[tid] );\n";
const char atomic_min_core[] = " oldValues[tid] = atomic_min( &destMemory[0], oldValues[tid] );\n";
cl_int test_atomic_min_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
cl_int total = 0x7fffffffL;
for( size_t i = 0; i < size; i++ )
{
if( startRefValues[ i ] < total )
total = startRefValues[ i ];
}
return total;
}
void test_atomic_min_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
for( size_t i = 0; i < size; i++ )
startRefValues[i] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
}
cl_long test_atomic_min_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
cl_long total = 0x7fffffffffffffffLL;
for( size_t i = 0; i < size; i++ )
{
if( startRefValues[ i ] < total )
total = startRefValues[ i ];
}
return total;
}
void test_atomic_min_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
for( size_t i = 0; i < size; i++ )
startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
}
int test_atomic_min(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { 0x7fffffffL, 0x7fffffffffffffffLL, NULL, test_atomic_min_result_int, test_atomic_min_gen_int, NULL, test_atomic_min_result_long, test_atomic_min_gen_long, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_min_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- max
const char atom_max_core[] = " oldValues[tid] = atom_max( &destMemory[0], oldValues[tid] );\n";
const char atomic_max_core[] = " oldValues[tid] = atomic_max( &destMemory[0], oldValues[tid] );\n";
cl_int test_atomic_max_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
cl_int total = 0;
for( size_t i = 0; i < size; i++ )
{
if( startRefValues[ i ] > total )
total = startRefValues[ i ];
}
return total;
}
void test_atomic_max_gen_int( size_t size, cl_int *startRefValues, MTdata d )
{
for( size_t i = 0; i < size; i++ )
startRefValues[i] = (cl_int)( genrand_int32(d) % 0x3fffffff ) + 0x3fffffff;
}
cl_long test_atomic_max_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
cl_long total = 0;
for( size_t i = 0; i < size; i++ )
{
if( startRefValues[ i ] > total )
total = startRefValues[ i ];
}
return total;
}
void test_atomic_max_gen_long( size_t size, cl_long *startRefValues, MTdata d )
{
for( size_t i = 0; i < size; i++ )
startRefValues[i] = (cl_long)( genrand_int32(d) | ( ( (cl_long)genrand_int32(d) & 0x7fffffffL ) << 16 ) );
}
int test_atomic_max(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { 0, 0, NULL, test_atomic_max_result_int, test_atomic_max_gen_int, NULL, test_atomic_max_result_long, test_atomic_max_gen_long, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_max_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- inc
const char atom_inc_core[] = " oldValues[tid] = atom_inc( &destMemory[0] );\n";
const char atomic_inc_core[] = " oldValues[tid] = atomic_inc( &destMemory[0] );\n";
cl_int test_atomic_inc_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
return INT_TEST_VALUE + (cl_int)size;
}
cl_long test_atomic_inc_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
return LONG_TEST_VALUE + size;
}
int test_atomic_inc(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_inc_result_int, NULL, NULL, test_atomic_inc_result_long, NULL, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_inc_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- dec
const char atom_dec_core[] = " oldValues[tid] = atom_dec( &destMemory[0] );\n";
const char atomic_dec_core[] = " oldValues[tid] = atomic_dec( &destMemory[0] );\n";
cl_int test_atomic_dec_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
return INT_TEST_VALUE - (cl_int)size;
}
cl_long test_atomic_dec_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
return LONG_TEST_VALUE - size;
}
int test_atomic_dec(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_dec_result_int, NULL, NULL, test_atomic_dec_result_long, NULL, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_dec_core, set, false, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- cmpxchg
/* We test cmpxchg by implementing (the long way) atom_add */
const char atom_cmpxchg_core[] =
" int oldValue, origValue, newValue;\n"
" do { \n"
" origValue = destMemory[0];\n"
" newValue = origValue + tid + 2;\n"
" oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
" } while( oldValue != origValue );\n"
" oldValues[tid] = oldValue;\n"
;
const char atom_cmpxchg64_core[] =
" long oldValue, origValue, newValue;\n"
" do { \n"
" origValue = destMemory[0];\n"
" newValue = origValue + tid + 2;\n"
" oldValue = atom_cmpxchg( &destMemory[0], origValue, newValue );\n"
" } while( oldValue != origValue );\n"
" oldValues[tid] = oldValue;\n"
;
const char atomic_cmpxchg_core[] =
" int oldValue, origValue, newValue;\n"
" do { \n"
" origValue = destMemory[0];\n"
" newValue = origValue + tid + 2;\n"
" oldValue = atomic_cmpxchg( &destMemory[0], origValue, newValue );\n"
" } while( oldValue != origValue );\n"
" oldValues[tid] = oldValue;\n"
;
cl_int test_atomic_cmpxchg_result_int( size_t size, cl_int *startRefValues, size_t whichDestValue )
{
cl_int total = INT_TEST_VALUE;
for( size_t i = 0; i < size; i++ )
total += (cl_int)i + 2;
return total;
}
cl_long test_atomic_cmpxchg_result_long( size_t size, cl_long *startRefValues, size_t whichDestValue )
{
cl_long total = LONG_TEST_VALUE;
for( size_t i = 0; i < size; i++ )
total += i + 2;
return total;
}
int test_atomic_cmpxchg(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { INT_TEST_VALUE, LONG_TEST_VALUE, NULL, test_atomic_cmpxchg_result_int, NULL, NULL, test_atomic_cmpxchg_result_long, NULL, NULL };
int errors = 0;
log_info(" Testing atom_ functions...\n");
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, false, kInt, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, false, kUInt, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, true, kInt, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg_core, set, false, true, kUInt, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, false, kLong, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, false, kULong, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, true, kLong, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atom_cmpxchg64_core, set, false, true, kULong, true );
log_info(" Testing atomic_ functions...\n");
errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, false, kInt, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, false, kUInt, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, true, kInt, true );
errors |= test_atomic_function( deviceID, context, queue, num_elements, atomic_cmpxchg_core, set, false, true, kUInt, true );
if( errors )
return -1;
return 0;
}
#pragma mark -------- Bitwise functions
size_t test_bitwise_num_results( size_t threadCount, ExplicitType dataType )
{
size_t numBits = get_explicit_type_size( dataType ) * 8;
return ( threadCount + numBits - 1 ) / numBits;
}
#pragma mark ---- and
const char atom_and_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atom_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;
const char atomic_and_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atomic_and( &destMemory[whichResult], ~( 1L << bitIndex ) );\n"
;
cl_int test_atomic_and_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
size_t numThreads = ( (size_t)size + 31 ) / 32;
if( whichResult < numThreads - 1 )
return 0;
// Last item doesn't get and'ed on every bit, so we have to mask away
size_t numBits = (size_t)size - whichResult * 32;
cl_int bits = (cl_int)0xffffffffL;
for( size_t i = 0; i < numBits; i++ )
bits &= ~( 1 << i );
return bits;
}
cl_long test_atomic_and_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
size_t numThreads = ( (size_t)size + 63 ) / 64;
if( whichResult < numThreads - 1 )
return 0;
// Last item doesn't get and'ed on every bit, so we have to mask away
size_t numBits = (size_t)size - whichResult * 64;
cl_long bits = (cl_long)0xffffffffffffffffLL;
for (size_t i = 0; i < numBits; i++) bits &= ~(1LL << i);
return bits;
}
int test_atomic_and(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { 0xffffffff, 0xffffffffffffffffLL, test_bitwise_num_results,
test_atomic_and_result_int, NULL, NULL, test_atomic_and_result_long, NULL, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_and_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- or
const char atom_or_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atom_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;
const char atomic_or_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int whichResult = tid / numBits;\n"
" int bitIndex = tid - ( whichResult * numBits );\n"
"\n"
" oldValues[tid] = atomic_or( &destMemory[whichResult], ( 1L << bitIndex ) );\n"
;
cl_int test_atomic_or_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
size_t numThreads = ( (size_t)size + 31 ) / 32;
if( whichResult < numThreads - 1 )
return 0xffffffff;
// Last item doesn't get and'ed on every bit, so we have to mask away
size_t numBits = (size_t)size - whichResult * 32;
cl_int bits = 0;
for( size_t i = 0; i < numBits; i++ )
bits |= ( 1 << i );
return bits;
}
cl_long test_atomic_or_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
size_t numThreads = ( (size_t)size + 63 ) / 64;
if( whichResult < numThreads - 1 )
return 0x0ffffffffffffffffLL;
// Last item doesn't get and'ed on every bit, so we have to mask away
size_t numBits = (size_t)size - whichResult * 64;
cl_long bits = 0;
for( size_t i = 0; i < numBits; i++ )
bits |= ( 1LL << i );
return bits;
}
int test_atomic_or(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { 0, 0LL, test_bitwise_num_results, test_atomic_or_result_int, NULL, NULL, test_atomic_or_result_long, NULL, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_or_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}
#pragma mark ---- xor
const char atom_xor_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int bitIndex = tid & ( numBits - 1 );\n"
"\n"
" oldValues[tid] = atom_xor( &destMemory[0], 1L << bitIndex );\n";
const char atomic_xor_core[] =
" size_t numBits = sizeof( destMemory[0] ) * 8;\n"
" int bitIndex = tid & ( numBits - 1 );\n"
"\n"
" oldValues[tid] = atomic_xor( &destMemory[0], 1L << bitIndex );\n";
cl_int test_atomic_xor_result_int( size_t size, cl_int *startRefValues, size_t whichResult )
{
cl_int total = 0x2f08ab41;
for( size_t i = 0; i < size; i++ )
total ^= ( 1 << ( i & 31 ) );
return total;
}
cl_long test_atomic_xor_result_long( size_t size, cl_long *startRefValues, size_t whichResult )
{
cl_long total = 0x2f08ab418ba0541LL;
for( size_t i = 0; i < size; i++ )
total ^= ( 1LL << ( i & 63 ) );
return total;
}
int test_atomic_xor(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
{
TestFns set = { 0x2f08ab41, 0x2f08ab418ba0541LL, NULL, test_atomic_xor_result_int, NULL, NULL, test_atomic_xor_result_long, NULL, NULL };
if( test_atomic_function_set( deviceID, context, queue, num_elements, atom_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ false ) != 0 )
return -1;
if( test_atomic_function_set( deviceID, context, queue, num_elements, atomic_xor_core, set, true, /*matchGroupSize*/ false, /*usingAtomicPrefix*/ true ) != 0 )
return -1;
return 0;
}