You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1029 lines
33 KiB
1029 lines
33 KiB
//
|
|
// Copyright (c) 2017 The Khronos Group Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
#include "ThreadPool.h"
|
|
#include "errorHelpers.h"
|
|
#include "fpcontrol.h"
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#if defined(__APPLE__) || defined(__linux__) || defined(_WIN32)
|
|
// or any other POSIX system
|
|
|
|
#if defined(_WIN32)
|
|
#include <windows.h>
|
|
#if defined(_MSC_VER)
|
|
#include <intrin.h>
|
|
#endif
|
|
#include "mingw_compat.h"
|
|
#include <process.h>
|
|
#else // !_WIN32
|
|
#include <pthread.h>
|
|
#include <unistd.h>
|
|
#include <sys/errno.h>
|
|
#ifdef __linux__
|
|
#include <sched.h>
|
|
#endif
|
|
#endif // !_WIN32
|
|
|
|
// declarations
|
|
#ifdef _WIN32
|
|
void ThreadPool_WorkerFunc(void *p);
|
|
#else
|
|
void *ThreadPool_WorkerFunc(void *p);
|
|
#endif
|
|
void ThreadPool_Init(void);
|
|
void ThreadPool_Exit(void);
|
|
|
|
#if defined(__MINGW32__)
|
|
// Mutex for implementing super heavy atomic operations if you don't have GCC or
|
|
// MSVC
|
|
CRITICAL_SECTION gAtomicLock;
|
|
#elif defined(__GNUC__) || defined(_MSC_VER)
|
|
#else
|
|
pthread_mutex_t gAtomicLock;
|
|
#endif
|
|
|
|
// Atomic add operator with mem barrier. Mem barrier needed to protect state
|
|
// modified by the worker functions.
|
|
cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
|
|
{
|
|
#if defined(__MINGW32__)
|
|
// No atomics on Mingw32
|
|
EnterCriticalSection(&gAtomicLock);
|
|
cl_int old = *a;
|
|
*a = old + b;
|
|
LeaveCriticalSection(&gAtomicLock);
|
|
return old;
|
|
#elif defined(__GNUC__)
|
|
// GCC extension:
|
|
// http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
|
|
return __sync_fetch_and_add(a, b);
|
|
// do we need __sync_synchronize() here, too? GCC docs are unclear whether
|
|
// __sync_fetch_and_add does a synchronize
|
|
#elif defined(_MSC_VER)
|
|
return (cl_int)_InterlockedExchangeAdd((volatile LONG *)a, (LONG)b);
|
|
#else
|
|
#warning Please add a atomic add implementation here, with memory barrier. Fallback code is slow.
|
|
if (pthread_mutex_lock(&gAtomicLock))
|
|
log_error("Atomic operation failed. pthread_mutex_lock(&gAtomicLock) "
|
|
"returned an error\n");
|
|
cl_int old = *a;
|
|
*a = old + b;
|
|
if (pthread_mutex_unlock(&gAtomicLock))
|
|
log_error("Failed to release gAtomicLock. Further atomic operations "
|
|
"may deadlock!\n");
|
|
return old;
|
|
#endif
|
|
}
|
|
|
|
#if defined(_WIN32)
|
|
// Uncomment the following line if Windows XP support is not required.
|
|
// #define HAS_INIT_ONCE_EXECUTE_ONCE 1
|
|
|
|
#if defined(HAS_INIT_ONCE_EXECUTE_ONCE)
|
|
#define _INIT_ONCE INIT_ONCE
|
|
#define _PINIT_ONCE PINIT_ONCE
|
|
#define _InitOnceExecuteOnce InitOnceExecuteOnce
|
|
#else // !HAS_INIT_ONCE_EXECUTE_ONCE
|
|
|
|
typedef volatile LONG _INIT_ONCE;
|
|
typedef _INIT_ONCE *_PINIT_ONCE;
|
|
typedef BOOL(CALLBACK *_PINIT_ONCE_FN)(_PINIT_ONCE, PVOID, PVOID *);
|
|
|
|
#define _INIT_ONCE_UNINITIALIZED 0
|
|
#define _INIT_ONCE_IN_PROGRESS 1
|
|
#define _INIT_ONCE_DONE 2
|
|
|
|
static BOOL _InitOnceExecuteOnce(_PINIT_ONCE InitOnce, _PINIT_ONCE_FN InitFn,
|
|
PVOID Parameter, LPVOID *Context)
|
|
{
|
|
while (*InitOnce != _INIT_ONCE_DONE)
|
|
{
|
|
if (*InitOnce != _INIT_ONCE_IN_PROGRESS
|
|
&& _InterlockedCompareExchange(InitOnce, _INIT_ONCE_IN_PROGRESS,
|
|
_INIT_ONCE_UNINITIALIZED)
|
|
== _INIT_ONCE_UNINITIALIZED)
|
|
{
|
|
InitFn(InitOnce, Parameter, Context);
|
|
*InitOnce = _INIT_ONCE_DONE;
|
|
return TRUE;
|
|
}
|
|
Sleep(1);
|
|
}
|
|
return TRUE;
|
|
}
|
|
#endif // !HAS_INIT_ONCE_EXECUTE_ONCE
|
|
|
|
// Uncomment the following line if Windows XP support is not required.
|
|
// #define HAS_CONDITION_VARIABLE 1
|
|
|
|
#if defined(HAS_CONDITION_VARIABLE)
|
|
#define _CONDITION_VARIABLE CONDITION_VARIABLE
|
|
#define _InitializeConditionVariable InitializeConditionVariable
|
|
#define _SleepConditionVariableCS SleepConditionVariableCS
|
|
#define _WakeAllConditionVariable WakeAllConditionVariable
|
|
#else // !HAS_CONDITION_VARIABLE
|
|
typedef struct
|
|
{
|
|
HANDLE mEvent; // Used to park the thread.
|
|
// Used to protect mWaiters, mGeneration and mReleaseCount:
|
|
CRITICAL_SECTION mLock[1];
|
|
volatile cl_int mWaiters; // Number of threads waiting on this cond var.
|
|
volatile cl_int mGeneration; // Wait generation count.
|
|
volatile cl_int mReleaseCount; // Number of releases to execute before
|
|
// reseting the event.
|
|
} _CONDITION_VARIABLE;
|
|
|
|
typedef _CONDITION_VARIABLE *_PCONDITION_VARIABLE;
|
|
|
|
// Prepares an emulated condition variable for use: creates the manual-reset,
// initially non-signalled event, initializes the internal lock and zeroes all
// three counters.
//
// mReleaseCount is cleared unconditionally. It is read by
// _SleepConditionVariableCS() in every build, so it must not depend on NDEBUG.
static void _InitializeConditionVariable(_PCONDITION_VARIABLE cond_var)
{
    cond_var->mEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
    if (cond_var->mEvent == NULL)
    {
        // The function has no way to report failure to its caller; at least
        // leave a trace, since every later wait on this event would misbehave.
        log_error("Error: CreateEvent failed while initializing the thread "
                  "pool condition variable.\n");
    }
    InitializeCriticalSection(cond_var->mLock);
    cond_var->mWaiters = 0;
    cond_var->mGeneration = 0;
    cond_var->mReleaseCount = 0;
}
|
|
|
|
static void _SleepConditionVariableCS(_PCONDITION_VARIABLE cond_var,
|
|
PCRITICAL_SECTION cond_lock,
|
|
DWORD ignored)
|
|
{
|
|
EnterCriticalSection(cond_var->mLock);
|
|
cl_int generation = cond_var->mGeneration;
|
|
++cond_var->mWaiters;
|
|
LeaveCriticalSection(cond_var->mLock);
|
|
LeaveCriticalSection(cond_lock);
|
|
|
|
while (TRUE)
|
|
{
|
|
WaitForSingleObject(cond_var->mEvent, INFINITE);
|
|
EnterCriticalSection(cond_var->mLock);
|
|
BOOL done =
|
|
cond_var->mReleaseCount > 0 && cond_var->mGeneration != generation;
|
|
LeaveCriticalSection(cond_var->mLock);
|
|
if (done)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
EnterCriticalSection(cond_lock);
|
|
EnterCriticalSection(cond_var->mLock);
|
|
if (--cond_var->mReleaseCount == 0)
|
|
{
|
|
ResetEvent(cond_var->mEvent);
|
|
}
|
|
--cond_var->mWaiters;
|
|
LeaveCriticalSection(cond_var->mLock);
|
|
}
|
|
|
|
// Emulated WakeAllConditionVariable(): if any thread is waiting, starts a new
// generation, grants one release per current waiter and signals the event.
// Does nothing when there are no waiters.
static void _WakeAllConditionVariable(_PCONDITION_VARIABLE cond_var)
{
    cl_int parked;

    EnterCriticalSection(cond_var->mLock);
    parked = cond_var->mWaiters;
    if (parked > 0)
    {
        ++cond_var->mGeneration;
        cond_var->mReleaseCount = parked;
        SetEvent(cond_var->mEvent);
    }
    LeaveCriticalSection(cond_var->mLock);
}
|
|
#endif // !HAS_CONDITION_VARIABLE
|
|
#endif // _WIN32
|
|
|
|
#define MAX_COUNT (1 << 29)
|
|
|
|
// Global state to coordinate whether the threads have been launched
|
|
// successfully or not
|
|
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
|
|
static _INIT_ONCE threadpool_init_control;
|
|
#elif defined(_WIN32) // MingW of XP
|
|
static int threadpool_init_control;
|
|
#else // Posix platforms
|
|
pthread_once_t threadpool_init_control = PTHREAD_ONCE_INIT;
|
|
#endif
|
|
cl_int threadPoolInitErr = -1; // set to CL_SUCCESS on successful thread launch
|
|
|
|
// critical region lock around ThreadPool_Do. We can only run one ThreadPool_Do
|
|
// at a time, because we are too lazy to set up a queue here, and don't expect
|
|
// to need one.
|
|
#if defined(_WIN32)
|
|
CRITICAL_SECTION gThreadPoolLock[1];
|
|
#else // !_WIN32
|
|
pthread_mutex_t gThreadPoolLock;
|
|
#endif // !_WIN32
|
|
|
|
// Condition variable to park ThreadPool threads when not working
|
|
#if defined(_WIN32)
|
|
CRITICAL_SECTION cond_lock[1];
|
|
_CONDITION_VARIABLE cond_var[1];
|
|
#else // !_WIN32
|
|
pthread_mutex_t cond_lock;
|
|
pthread_cond_t cond_var;
|
|
#endif // !_WIN32
|
|
|
|
// Condition variable state. How many iterations on the function left to run,
|
|
// set to CL_INT_MAX to cause worker threads to exit. Note: this value might
|
|
// go negative.
|
|
volatile cl_int gRunCount = 0;
|
|
|
|
// State that only changes when the threadpool is not working.
|
|
volatile TPFuncPtr gFunc_ptr = NULL;
|
|
volatile void *gUserInfo = NULL;
|
|
volatile cl_int gJobCount = 0;
|
|
|
|
// State that may change while the thread pool is working
|
|
volatile cl_int jobError = CL_SUCCESS; // err code return for the job as a whole
|
|
|
|
// Condition variable to park caller while waiting
|
|
#if defined(_WIN32)
|
|
HANDLE caller_event;
|
|
#else // !_WIN32
|
|
pthread_mutex_t caller_cond_lock;
|
|
pthread_cond_t caller_cond_var;
|
|
#endif // !_WIN32
|
|
|
|
// # of threads intended to be running. Running threads will decrement this
|
|
// as they discover they've run out of work to do.
|
|
volatile cl_int gRunning = 0;
|
|
|
|
// The total number of threads launched.
|
|
volatile cl_int gThreadCount = 0;
|
|
#ifdef _WIN32
|
|
void ThreadPool_WorkerFunc(void *p)
|
|
#else
|
|
void *ThreadPool_WorkerFunc(void *p)
|
|
#endif
|
|
{
|
|
cl_uint threadID = ThreadPool_AtomicAdd((volatile cl_int *)p, 1);
|
|
cl_int item = ThreadPool_AtomicAdd(&gRunCount, -1);
|
|
// log_info( "ThreadPool_WorkerFunc start: gRunning = %d\n", gRunning );
|
|
|
|
while (MAX_COUNT > item)
|
|
{
|
|
cl_int err;
|
|
|
|
// check for more work to do
|
|
if (0 >= item)
|
|
{
|
|
// log_info("Thread %d has run out of work.\n", threadID);
|
|
|
|
// No work to do. Attempt to block waiting for work
|
|
#if defined(_WIN32)
|
|
EnterCriticalSection(cond_lock);
|
|
#else // !_WIN32
|
|
if ((err = pthread_mutex_lock(&cond_lock)))
|
|
{
|
|
log_error(
|
|
"Error %d from pthread_mutex_lock. Worker %d unable to "
|
|
"block waiting for work. ThreadPool_WorkerFunc failed.\n",
|
|
err, threadID);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
cl_int remaining = ThreadPool_AtomicAdd(&gRunning, -1);
|
|
// log_info("ThreadPool_WorkerFunc: gRunning = %d\n",
|
|
// remaining - 1);
|
|
if (1 == remaining)
|
|
{ // last thread out signal the main thread to wake up
|
|
#if defined(_WIN32)
|
|
SetEvent(caller_event);
|
|
#else // !_WIN32
|
|
if ((err = pthread_mutex_lock(&caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_lock. Unable to "
|
|
"wake caller.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
if ((err = pthread_cond_broadcast(&caller_cond_var)))
|
|
{
|
|
log_error(
|
|
"Error %d from pthread_cond_broadcast. Unable to wake "
|
|
"up main thread. ThreadPool_WorkerFunc failed.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
if ((err = pthread_mutex_unlock(&caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_lock. Unable to "
|
|
"wake caller.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
}
|
|
|
|
// loop in case we are woken only to discover that some other thread
|
|
// already did all the work
|
|
while (0 >= item)
|
|
{
|
|
#if defined(_WIN32)
|
|
_SleepConditionVariableCS(cond_var, cond_lock, INFINITE);
|
|
#else // !_WIN32
|
|
if ((err = pthread_cond_wait(&cond_var, &cond_lock)))
|
|
{
|
|
log_error(
|
|
"Error %d from pthread_cond_wait. Unable to block for "
|
|
"waiting for work. ThreadPool_WorkerFunc failed.\n",
|
|
err);
|
|
pthread_mutex_unlock(&cond_lock);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
// try again to get a valid item id
|
|
item = ThreadPool_AtomicAdd(&gRunCount, -1);
|
|
if (MAX_COUNT <= item) // exit if we are done
|
|
{
|
|
#if defined(_WIN32)
|
|
LeaveCriticalSection(cond_lock);
|
|
#else // !_WIN32
|
|
pthread_mutex_unlock(&cond_lock);
|
|
#endif // !_WIN32
|
|
goto exit;
|
|
}
|
|
}
|
|
|
|
ThreadPool_AtomicAdd(&gRunning, 1);
|
|
// log_info("Thread %d has found work.\n", threadID);
|
|
|
|
#if defined(_WIN32)
|
|
LeaveCriticalSection(cond_lock);
|
|
#else // !_WIN32
|
|
if ((err = pthread_mutex_unlock(&cond_lock)))
|
|
{
|
|
log_error(
|
|
"Error %d from pthread_mutex_unlock. Unable to block for "
|
|
"waiting for work. ThreadPool_WorkerFunc failed.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
}
|
|
|
|
// we have a valid item, so do the work
|
|
// but only if we haven't already encountered an error
|
|
if (CL_SUCCESS == jobError)
|
|
{
|
|
// log_info("Thread %d doing job %d\n", threadID, item - 1);
|
|
|
|
#if defined(__APPLE__) && defined(__arm__)
|
|
// On most platforms which support denorm, default is FTZ off.
|
|
// However, on some hardware where the reference is computed,
|
|
// default might be flush denorms to zero e.g. arm. This creates
|
|
// issues in result verification. Since spec allows the
|
|
// implementation to either flush or not flush denorms to zero, an
|
|
// implementation may choose not be flush i.e. return denorm result
|
|
// whereas reference result may be zero (flushed denorm). Hence we
|
|
// need to disable denorm flushing on host side where reference is
|
|
// being computed to make sure we get non-flushed reference result.
|
|
// If implementation returns flushed result, we correctly take care
|
|
// of that in verification code.
|
|
FPU_mode_type oldMode;
|
|
DisableFTZ(&oldMode);
|
|
#endif
|
|
|
|
// Call the user's function with this item ID
|
|
err = gFunc_ptr(item - 1, threadID, (void *)gUserInfo);
|
|
#if defined(__APPLE__) && defined(__arm__)
|
|
// Restore FP state
|
|
RestoreFPState(&oldMode);
|
|
#endif
|
|
|
|
if (err)
|
|
{
|
|
#if (__MINGW32__)
|
|
EnterCriticalSection(&gAtomicLock);
|
|
if (jobError == CL_SUCCESS) jobError = err;
|
|
gRunCount = 0;
|
|
LeaveCriticalSection(&gAtomicLock);
|
|
#elif defined(__GNUC__)
|
|
// GCC extension:
|
|
// http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
|
|
// set the new error if we are the first one there.
|
|
__sync_val_compare_and_swap(&jobError, CL_SUCCESS, err);
|
|
|
|
// drop run count to 0
|
|
gRunCount = 0;
|
|
__sync_synchronize();
|
|
#elif defined(_MSC_VER)
|
|
// set the new error if we are the first one there.
|
|
_InterlockedCompareExchange((volatile LONG *)&jobError, err,
|
|
CL_SUCCESS);
|
|
|
|
// drop run count to 0
|
|
gRunCount = 0;
|
|
_mm_mfence();
|
|
#else
|
|
if (pthread_mutex_lock(&gAtomicLock))
|
|
log_error(
|
|
"Atomic operation failed. "
|
|
"pthread_mutex_lock(&gAtomicLock) returned an error\n");
|
|
if (jobError == CL_SUCCESS) jobError = err;
|
|
gRunCount = 0;
|
|
if (pthread_mutex_unlock(&gAtomicLock))
|
|
log_error("Failed to release gAtomicLock. Further atomic "
|
|
"operations may deadlock\n");
|
|
#endif
|
|
}
|
|
}
|
|
|
|
// get the next item
|
|
item = ThreadPool_AtomicAdd(&gRunCount, -1);
|
|
}
|
|
|
|
exit:
|
|
log_info("ThreadPool: thread %d exiting.\n", threadID);
|
|
ThreadPool_AtomicAdd(&gThreadCount, -1);
|
|
#if !defined(_WIN32)
|
|
return NULL;
|
|
#endif
|
|
}
|
|
|
|
// SetThreadCount() may be used to artifically set the number of worker threads
|
|
// If the value is 0 (the default) the number of threads will be determined
|
|
// based on the number of CPU cores. If it is a unicore machine, then 2 will be
|
|
// used, so that we still get some testing for thread safety.
|
|
//
|
|
// If count < 2 or the CL_TEST_SINGLE_THREADED environment variable is set then
|
|
// the code will run single threaded, but will report an error to indicate that
|
|
// the test is invalid. This option is intended for debugging purposes only. It
|
|
// is suggested as a convention that test apps set the thread count to 1 in
|
|
// response to the -m flag.
|
|
//
|
|
// SetThreadCount() must be called before the first call to GetThreadCount() or
|
|
// ThreadPool_Do(), otherwise the behavior is indefined.
|
|
void SetThreadCount(int count)
|
|
{
|
|
if (threadPoolInitErr == CL_SUCCESS)
|
|
{
|
|
log_error("Error: It is illegal to set the thread count after the "
|
|
"first call to ThreadPool_Do or GetThreadCount\n");
|
|
abort();
|
|
}
|
|
|
|
gThreadCount = count;
|
|
}
|
|
|
|
void ThreadPool_Init(void)
|
|
{
|
|
cl_int i;
|
|
int err;
|
|
volatile cl_uint threadID = 0;
|
|
|
|
// Check for manual override of multithreading code. We add this for better
|
|
// debuggability.
|
|
if (getenv("CL_TEST_SINGLE_THREADED"))
|
|
{
|
|
log_error("ERROR: CL_TEST_SINGLE_THREADED is set in the environment. "
|
|
"Running single threaded.\n*** TEST IS INVALID! ***\n");
|
|
gThreadCount = 1;
|
|
return;
|
|
}
|
|
|
|
// Figure out how many threads to run -- check first for non-zero to give
|
|
// the implementation the chance
|
|
if (0 == gThreadCount)
|
|
{
|
|
#if defined(_MSC_VER) || defined(__MINGW64__)
|
|
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
|
|
DWORD length = 0;
|
|
|
|
GetLogicalProcessorInformation(NULL, &length);
|
|
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(length);
|
|
if (buffer != NULL)
|
|
{
|
|
if (GetLogicalProcessorInformation(buffer, &length) == TRUE)
|
|
{
|
|
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
|
|
while (
|
|
ptr
|
|
< &buffer[length
|
|
/ sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION)])
|
|
{
|
|
if (ptr->Relationship == RelationProcessorCore)
|
|
{
|
|
// Count the number of bits in ProcessorMask (number of
|
|
// logical cores)
|
|
ULONG mask = ptr->ProcessorMask;
|
|
while (mask)
|
|
{
|
|
++gThreadCount;
|
|
mask &= mask - 1; // Remove 1 bit at a time
|
|
}
|
|
}
|
|
++ptr;
|
|
}
|
|
}
|
|
free(buffer);
|
|
}
|
|
#elif defined(__MINGW32__)
|
|
{
|
|
#warning How about this, instead of hard coding it to 2?
|
|
SYSTEM_INFO sysinfo;
|
|
GetSystemInfo(&sysinfo);
|
|
gThreadCount = sysinfo.dwNumberOfProcessors;
|
|
}
|
|
#elif defined(__linux__) && !defined(__ANDROID__)
|
|
cpu_set_t affinity;
|
|
if (0 == sched_getaffinity(0, sizeof(cpu_set_t), &affinity))
|
|
{
|
|
#if !(defined(CPU_COUNT))
|
|
gThreadCount = 1;
|
|
#else
|
|
gThreadCount = CPU_COUNT(&affinity);
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
// Hopefully your system returns logical cpus here, as does MacOS X
|
|
gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
|
|
}
|
|
#else /* !_WIN32 */
|
|
// Hopefully your system returns logical cpus here, as does MacOS X
|
|
gThreadCount = (cl_int)sysconf(_SC_NPROCESSORS_CONF);
|
|
#endif // !_WIN32
|
|
|
|
// Multithreaded tests are required to run multithreaded even on unicore
|
|
// systems so as to test thread safety
|
|
if (1 == gThreadCount) gThreadCount = 2;
|
|
}
|
|
|
|
// When working in 32 bit limit the thread number to 12
|
|
// This fix was made due to memory issues in integer_ops test
|
|
// When running integer_ops, the test opens as many threads as the
|
|
// machine has and each thread allocates a fixed amount of memory
|
|
// When running this test on dual socket machine in 32-bit, the
|
|
// process memory is not sufficient and the test fails
|
|
#if defined(_WIN32) && !defined(_M_X64)
|
|
if (gThreadCount > 12)
|
|
{
|
|
gThreadCount = 12;
|
|
}
|
|
#endif
|
|
|
|
// Allow the app to set thread count to <0 for debugging purposes.
|
|
// This will cause the test to run single threaded.
|
|
if (gThreadCount < 2)
|
|
{
|
|
log_error("ERROR: Running single threaded because thread count < 2. "
|
|
"\n*** TEST IS INVALID! ***\n");
|
|
gThreadCount = 1;
|
|
return;
|
|
}
|
|
|
|
#if defined(_WIN32)
|
|
InitializeCriticalSection(gThreadPoolLock);
|
|
InitializeCriticalSection(cond_lock);
|
|
_InitializeConditionVariable(cond_var);
|
|
caller_event = CreateEvent(NULL, FALSE, FALSE, NULL);
|
|
#elif defined(__GNUC__)
|
|
// Dont rely on PTHREAD_MUTEX_INITIALIZER for intialization of a mutex since
|
|
// it might cause problem with some flavors of gcc compilers.
|
|
pthread_cond_init(&cond_var, NULL);
|
|
pthread_mutex_init(&cond_lock, NULL);
|
|
pthread_cond_init(&caller_cond_var, NULL);
|
|
pthread_mutex_init(&caller_cond_lock, NULL);
|
|
pthread_mutex_init(&gThreadPoolLock, NULL);
|
|
#endif
|
|
|
|
#if !(defined(__GNUC__) || defined(_MSC_VER) || defined(__MINGW32__))
|
|
pthread_mutex_initialize(gAtomicLock);
|
|
#elif defined(__MINGW32__)
|
|
InitializeCriticalSection(&gAtomicLock);
|
|
#endif
|
|
// Make sure the last thread done in the work pool doesn't signal us to wake
|
|
// before we get to the point where we are supposed to wait
|
|
// That would cause a deadlock.
|
|
#if !defined(_WIN32)
|
|
if ((err = pthread_mutex_lock(&caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_lock. Unable to block for work "
|
|
"to finish. ThreadPool_Init failed.\n",
|
|
err);
|
|
gThreadCount = 1;
|
|
return;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
gRunning = gThreadCount;
|
|
// init threads
|
|
for (i = 0; i < gThreadCount; i++)
|
|
{
|
|
#if defined(_WIN32)
|
|
uintptr_t handle =
|
|
_beginthread(ThreadPool_WorkerFunc, 0, (void *)&threadID);
|
|
err = (handle == 0);
|
|
#else // !_WIN32
|
|
pthread_t tid = 0;
|
|
err = pthread_create(&tid, NULL, ThreadPool_WorkerFunc,
|
|
(void *)&threadID);
|
|
#endif // !_WIN32
|
|
if (err)
|
|
{
|
|
log_error("Error %d launching thread %d\n", err, i);
|
|
threadPoolInitErr = err;
|
|
gThreadCount = i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
atexit(ThreadPool_Exit);
|
|
|
|
// block until they are done launching.
|
|
do
|
|
{
|
|
#if defined(_WIN32)
|
|
WaitForSingleObject(caller_event, INFINITE);
|
|
#else // !_WIN32
|
|
if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_cond_wait. Unable to block for "
|
|
"work to finish. ThreadPool_Init failed.\n",
|
|
err);
|
|
pthread_mutex_unlock(&caller_cond_lock);
|
|
return;
|
|
}
|
|
#endif // !_WIN32
|
|
} while (gRunCount != -gThreadCount);
|
|
#if !defined(_WIN32)
|
|
if ((err = pthread_mutex_unlock(&caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_unlock. Unable to block for "
|
|
"work to finish. ThreadPool_Init failed.\n",
|
|
err);
|
|
return;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
threadPoolInitErr = CL_SUCCESS;
|
|
}
|
|
|
|
#if defined(_MSC_VER)
|
|
// Adapter that gives ThreadPool_Init() the callback signature expected by
// _InitOnceExecuteOnce(). None of the arguments are used; it always reports
// success.
static BOOL CALLBACK _ThreadPool_Init(_PINIT_ONCE InitOnce, PVOID Parameter,
                                      PVOID *lpContex)
{
    (void)InitOnce;
    (void)Parameter;
    (void)lpContex;

    ThreadPool_Init();

    return TRUE;
}
|
|
#endif
|
|
|
|
void ThreadPool_Exit(void)
|
|
{
|
|
int err, count;
|
|
gRunCount = CL_INT_MAX;
|
|
|
|
#if defined(__GNUC__)
|
|
// GCC extension:
|
|
// http://gcc.gnu.org/onlinedocs/gcc/Atomic-Builtins.html#Atomic-Builtins
|
|
__sync_synchronize();
|
|
#elif defined(_MSC_VER)
|
|
_mm_mfence();
|
|
#else
|
|
#warning If this is a weakly ordered memory system, please add a memory barrier here to force this and everything else to memory before we proceed
|
|
#endif
|
|
|
|
// spin waiting for threads to die
|
|
for (count = 0; 0 != gThreadCount && count < 1000; count++)
|
|
{
|
|
#if defined(_WIN32)
|
|
_WakeAllConditionVariable(cond_var);
|
|
Sleep(1);
|
|
#else // !_WIN32
|
|
if ((err = pthread_cond_broadcast(&cond_var)))
|
|
{
|
|
log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
|
|
"work threads. ThreadPool_Exit failed.\n",
|
|
err);
|
|
break;
|
|
}
|
|
usleep(1000);
|
|
#endif // !_WIN32
|
|
}
|
|
|
|
if (gThreadCount)
|
|
log_error("Error: Thread pool timed out after 1 second with %d threads "
|
|
"still active.\n",
|
|
gThreadCount);
|
|
else
|
|
log_info("Thread pool exited in a orderly fashion.\n");
|
|
}
|
|
|
|
|
|
// Blocking API that farms out count jobs to a thread pool.
|
|
// It may return with some work undone if func_ptr() returns a non-zero
|
|
// result.
|
|
//
|
|
// This function obviously has its shortcommings. Only one call to ThreadPool_Do
|
|
// can be running at a time. It is not intended for general purpose use.
|
|
// If clEnqueueNativeKernelFn, out of order queues and a CL_DEVICE_TYPE_CPU were
|
|
// all available then it would make more sense to use those features.
|
|
cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
|
|
{
|
|
cl_int newErr;
|
|
cl_int err = 0;
|
|
// Lazily set up our threads
|
|
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
|
|
err = !_InitOnceExecuteOnce(&threadpool_init_control, _ThreadPool_Init,
|
|
NULL, NULL);
|
|
#elif defined(_WIN32)
|
|
if (threadpool_init_control == 0)
|
|
{
|
|
#warning This is buggy and race prone. Find a better way.
|
|
ThreadPool_Init();
|
|
threadpool_init_control = 1;
|
|
}
|
|
#else // posix platform
|
|
err = pthread_once(&threadpool_init_control, ThreadPool_Init);
|
|
if (err)
|
|
{
|
|
log_error("Error %d from pthread_once. Unable to init threads. "
|
|
"ThreadPool_Do failed.\n",
|
|
err);
|
|
return err;
|
|
}
|
|
#endif
|
|
// Single threaded code to handle case where threadpool wasn't allocated or
|
|
// was disabled by environment variable
|
|
if (threadPoolInitErr)
|
|
{
|
|
cl_uint currentJob = 0;
|
|
cl_int result = CL_SUCCESS;
|
|
|
|
#if defined(__APPLE__) && defined(__arm__)
|
|
// On most platforms which support denorm, default is FTZ off. However,
|
|
// on some hardware where the reference is computed, default might be
|
|
// flush denorms to zero e.g. arm. This creates issues in result
|
|
// verification. Since spec allows the implementation to either flush or
|
|
// not flush denorms to zero, an implementation may choose not be flush
|
|
// i.e. return denorm result whereas reference result may be zero
|
|
// (flushed denorm). Hence we need to disable denorm flushing on host
|
|
// side where reference is being computed to make sure we get
|
|
// non-flushed reference result. If implementation returns flushed
|
|
// result, we correctly take care of that in verification code.
|
|
FPU_mode_type oldMode;
|
|
DisableFTZ(&oldMode);
|
|
#endif
|
|
for (currentJob = 0; currentJob < count; currentJob++)
|
|
if ((result = func_ptr(currentJob, 0, userInfo)))
|
|
{
|
|
#if defined(__APPLE__) && defined(__arm__)
|
|
// Restore FP state before leaving
|
|
RestoreFPState(&oldMode);
|
|
#endif
|
|
return result;
|
|
}
|
|
|
|
#if defined(__APPLE__) && defined(__arm__)
|
|
// Restore FP state before leaving
|
|
RestoreFPState(&oldMode);
|
|
#endif
|
|
|
|
return CL_SUCCESS;
|
|
}
|
|
|
|
if (count >= MAX_COUNT)
|
|
{
|
|
log_error(
|
|
"Error: ThreadPool_Do count %d >= max threadpool count of %d\n",
|
|
count, MAX_COUNT);
|
|
return -1;
|
|
}
|
|
|
|
// Enter critical region
|
|
#if defined(_WIN32)
|
|
EnterCriticalSection(gThreadPoolLock);
|
|
#else // !_WIN32
|
|
if ((err = pthread_mutex_lock(&gThreadPoolLock)))
|
|
{
|
|
switch (err)
|
|
{
|
|
case EDEADLK:
|
|
log_error(
|
|
"Error EDEADLK returned in ThreadPool_Do(). ThreadPool_Do "
|
|
"is not designed to work recursively!\n");
|
|
break;
|
|
case EINVAL:
|
|
log_error("Error EINVAL returned in ThreadPool_Do(). How did "
|
|
"we end up with an invalid gThreadPoolLock?\n");
|
|
break;
|
|
default: break;
|
|
}
|
|
return err;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
// Start modifying the job state observable by worker threads
|
|
#if defined(_WIN32)
|
|
EnterCriticalSection(cond_lock);
|
|
#else // !_WIN32
|
|
if ((err = pthread_mutex_lock(&cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_lock. Unable to wake up work "
|
|
"threads. ThreadPool_Do failed.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
// Make sure the last thread done in the work pool doesn't signal us to wake
|
|
// before we get to the point where we are supposed to wait
|
|
// That would cause a deadlock.
|
|
#if !defined(_WIN32)
|
|
if ((err = pthread_mutex_lock(&caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_lock. Unable to block for work "
|
|
"to finish. ThreadPool_Do failed.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
// Prime the worker threads to get going
|
|
jobError = CL_SUCCESS;
|
|
gRunCount = gJobCount = count;
|
|
gFunc_ptr = func_ptr;
|
|
gUserInfo = userInfo;
|
|
|
|
#if defined(_WIN32)
|
|
ResetEvent(caller_event);
|
|
_WakeAllConditionVariable(cond_var);
|
|
LeaveCriticalSection(cond_lock);
|
|
#else // !_WIN32
|
|
if ((err = pthread_cond_broadcast(&cond_var)))
|
|
{
|
|
log_error("Error %d from pthread_cond_broadcast. Unable to wake up "
|
|
"work threads. ThreadPool_Do failed.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
if ((err = pthread_mutex_unlock(&cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_unlock. Unable to wake up work "
|
|
"threads. ThreadPool_Do failed.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
// block until they are done. It would be slightly more efficient to do
|
|
// some of the work here though.
|
|
do
|
|
{
|
|
#if defined(_WIN32)
|
|
WaitForSingleObject(caller_event, INFINITE);
|
|
#else // !_WIN32
|
|
if ((err = pthread_cond_wait(&caller_cond_var, &caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_cond_wait. Unable to block for "
|
|
"work to finish. ThreadPool_Do failed.\n",
|
|
err);
|
|
pthread_mutex_unlock(&caller_cond_lock);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
} while (gRunning);
|
|
#if !defined(_WIN32)
|
|
if ((err = pthread_mutex_unlock(&caller_cond_lock)))
|
|
{
|
|
log_error("Error %d from pthread_mutex_unlock. Unable to block for "
|
|
"work to finish. ThreadPool_Do failed.\n",
|
|
err);
|
|
goto exit;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
err = jobError;
|
|
|
|
exit:
|
|
// exit critical region
|
|
#if defined(_WIN32)
|
|
LeaveCriticalSection(gThreadPoolLock);
|
|
#else // !_WIN32
|
|
newErr = pthread_mutex_unlock(&gThreadPoolLock);
|
|
if (newErr)
|
|
{
|
|
log_error("Error %d from pthread_mutex_unlock. Unable to exit critical "
|
|
"region. ThreadPool_Do failed.\n",
|
|
newErr);
|
|
return err;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
return err;
|
|
}
|
|
|
|
cl_uint GetThreadCount(void)
|
|
{
|
|
// Lazily set up our threads
|
|
#if defined(_MSC_VER) && (_WIN32_WINNT >= 0x600)
|
|
cl_int err = !_InitOnceExecuteOnce(&threadpool_init_control,
|
|
_ThreadPool_Init, NULL, NULL);
|
|
#elif defined(_WIN32)
|
|
if (threadpool_init_control == 0)
|
|
{
|
|
#warning This is buggy and race prone. Find a better way.
|
|
ThreadPool_Init();
|
|
threadpool_init_control = 1;
|
|
}
|
|
#else
|
|
cl_int err = pthread_once(&threadpool_init_control, ThreadPool_Init);
|
|
if (err)
|
|
{
|
|
log_error("Error %d from pthread_once. Unable to init threads. "
|
|
"ThreadPool_Do failed.\n",
|
|
err);
|
|
return err;
|
|
}
|
|
#endif // !_WIN32
|
|
|
|
if (gThreadCount < 1) return 1;
|
|
|
|
return gThreadCount;
|
|
}
|
|
|
|
#else
|
|
|
|
#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
|
|
#error ThreadPool implementation has not been multithreaded for this operating system. You must multithread this section.
|
|
#endif
|
|
//
|
|
// We require multithreading in parts of the test as a means of simultaneously
|
|
// testing reentrancy requirements of OpenCL API, while also checking
|
|
//
|
|
// A sample single threaded implementation follows, for documentation /
|
|
// bootstrapping purposes. It is not okay to use this for conformance testing!!!
|
|
//
|
|
// Exception: If your operating system does not support multithreaded execution
|
|
// of any kind, then you may use this code.
|
|
//
|
|
|
|
// Single threaded stand-in for the atomic add: adds b to *a and returns the
// value *a held before the addition.
cl_int ThreadPool_AtomicAdd(volatile cl_int *a, cl_int b)
{
    // This fallback code path is not multithreaded, so a plain read followed
    // by a plain write is sufficient. If your operating system supports
    // memory-barrier-atomics, use those here.
    const cl_uint before = *a;

    *a = before + b;

    return before;
}
|
|
|
|
// Blocking API that farms out count jobs to a thread pool.
|
|
// It may return with some work undone if func_ptr() returns a non-zero
|
|
// result.
|
|
cl_int ThreadPool_Do(TPFuncPtr func_ptr, cl_uint count, void *userInfo)
|
|
{
|
|
cl_uint currentJob = 0;
|
|
cl_int result = CL_SUCCESS;
|
|
|
|
#ifndef MY_OS_REALLY_REALLY_DOESNT_SUPPORT_THREADS
|
|
// THIS FUNCTION IS NOT INTENDED FOR USE!!
|
|
log_error("ERROR: Test must be multithreaded!\n");
|
|
exit(-1);
|
|
#else
|
|
static int spewCount = 0;
|
|
|
|
if (0 == spewCount)
|
|
{
|
|
log_info("\nWARNING: The operating system is claimed not to support "
|
|
"threads of any sort. Running single threaded.\n");
|
|
spewCount = 1;
|
|
}
|
|
#endif
|
|
|
|
// The multithreaded code should mimic this behavior:
|
|
for (currentJob = 0; currentJob < count; currentJob++)
|
|
if ((result = func_ptr(currentJob, 0, userInfo))) return result;
|
|
|
|
return CL_SUCCESS;
|
|
}
|
|
|
|
// Fallback variant: there is no pool, so the thread count is always 1.
cl_uint GetThreadCount(void)
{
    return 1;
}
|
|
|
|
// Fallback variant: the thread count cannot be changed. Requests for more than
// one thread are reported and otherwise ignored.
void SetThreadCount(int count)
{
    if (count <= 1)
    {
        return;
    }

    log_info("WARNING: SetThreadCount(%d) ignored\n", count);
}
|
|
|
|
#endif
|