//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef _COMMON_H_
#define _COMMON_H_

#include "harness/testHarness.h"
#include "harness/typeWrappers.h"
#include "harness/ThreadPool.h"

#include "host_atomics.h"

#include <vector>
#include <sstream>

#define MAX_DEVICE_THREADS (gHost ? 0U : gMaxDeviceThreads)
#define MAX_HOST_THREADS GetThreadCount()

#define EXECUTE_TEST(error, test)\
    error |= test;\
    if(error && !gContinueOnError)\
        return error;

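// EXECUTE_TEST accumulates the result of a test expression into 'error' and,
// unless gContinueOnError is set, returns from the calling function on the
// first failure, e.g.:
//   int error = 0;
//   EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
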
enum TExplicitAtomicType
{
    TYPE_ATOMIC_INT,
    TYPE_ATOMIC_UINT,
    TYPE_ATOMIC_LONG,
    TYPE_ATOMIC_ULONG,
    TYPE_ATOMIC_FLOAT,
    TYPE_ATOMIC_DOUBLE,
    TYPE_ATOMIC_INTPTR_T,
    TYPE_ATOMIC_UINTPTR_T,
    TYPE_ATOMIC_SIZE_T,
    TYPE_ATOMIC_PTRDIFF_T,
    TYPE_ATOMIC_FLAG
};

enum TExplicitMemoryScopeType
{
    MEMORY_SCOPE_EMPTY,
    MEMORY_SCOPE_WORK_GROUP,
    MEMORY_SCOPE_DEVICE,
    MEMORY_SCOPE_ALL_DEVICES, // Alias for MEMORY_SCOPE_ALL_SVM_DEVICES
    MEMORY_SCOPE_ALL_SVM_DEVICES
};

extern bool gHost; // temporary flag for testing native host threads (test verification)
extern bool gOldAPI; // temporary flag for testing with old API (OpenCL 1.2)
extern bool gContinueOnError; // execute all cases even when errors detected
extern bool gNoGlobalVariables; // disable cases with global atomics in program scope
extern bool gNoGenericAddressSpace; // disable cases with generic address space
extern bool gUseHostPtr; // use malloc/free instead of clSVMAlloc/clSVMFree
extern bool gDebug; // print OpenCL kernel code
extern int gInternalIterations; // internal test iterations for atomic operation, sufficient to verify atomicity
extern int gMaxDeviceThreads; // maximum number of threads executed on OCL device
extern cl_device_atomic_capabilities gAtomicMemCap,
    gAtomicFenceCap; // atomic memory and fence capabilities for this device

extern const char *get_memory_order_type_name(TExplicitMemoryOrderType orderType);
extern const char *get_memory_scope_type_name(TExplicitMemoryScopeType scopeType);

extern cl_int getSupportedMemoryOrdersAndScopes(
    cl_device_id device, std::vector<TExplicitMemoryOrderType> &memoryOrders,
    std::vector<TExplicitMemoryScopeType> &memoryScopes);

class AtomicTypeInfo
{
public:
    TExplicitAtomicType _type;
    AtomicTypeInfo(TExplicitAtomicType type): _type(type) {}
    cl_uint Size(cl_device_id device);
    const char* AtomicTypeName();
    const char* RegularTypeName();
    const char* AddSubOperandTypeName();
    int IsSupported(cl_device_id device);
};

template<typename HostDataType>
class AtomicTypeExtendedInfo : public AtomicTypeInfo
{
public:
    AtomicTypeExtendedInfo(TExplicitAtomicType type) : AtomicTypeInfo(type) {}
    HostDataType MinValue();
    HostDataType MaxValue();
    HostDataType SpecialValue(cl_uchar x)
    {
        HostDataType tmp;
        cl_uchar *ptr = (cl_uchar*)&tmp;
        for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_uchar); i++)
            ptr[i] = x;
        return tmp;
    }
    HostDataType SpecialValue(cl_ushort x)
    {
        HostDataType tmp;
        cl_ushort *ptr = (cl_ushort*)&tmp;
        for(cl_uint i = 0; i < sizeof(HostDataType)/sizeof(cl_ushort); i++)
            ptr[i] = x;
        return tmp;
    }
};

class CTest {
public:
    virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements) = 0;
};

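// CBasicTest drives one atomic test case. Execute() validates the host/device
// type sizes, SVM capabilities and type support, then walks the parameter
// space: ExecuteForEachParameterSet() (local vs. global memory) ->
// ExecuteForEachDeclarationType() (buffer argument vs. program-scope
// declaration) -> ExecuteForEachPointerType() (direct use, use in a helper
// function, generic address space) -> ExecuteSingleTest(), which builds and
// runs the generated OpenCL kernel and optionally mirrors the same operation
// in host threads (SVM / gHost cases).
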
template<typename HostAtomicType, typename HostDataType>
class CBasicTest : CTest
{
public:
    typedef struct {
        CBasicTest *test;
        cl_uint tid;
        cl_uint threadCount;
        volatile HostAtomicType *destMemory;
        HostDataType *oldValues;
    } THostThreadContext;
    static cl_int HostThreadFunction(cl_uint job_id, cl_uint thread_id, void *userInfo)
    {
        THostThreadContext *threadContext = ((THostThreadContext*)userInfo)+job_id;
        threadContext->test->HostFunction(threadContext->tid, threadContext->threadCount, threadContext->destMemory, threadContext->oldValues);
        return 0;
    }
    CBasicTest(TExplicitAtomicType dataType, bool useSVM) : CTest(),
        _maxDeviceThreads(MAX_DEVICE_THREADS),
        _dataType(dataType), _useSVM(useSVM), _startValue(255),
        _localMemory(false), _declaredInProgram(false),
        _usedInFunction(false), _genericAddrSpace(false),
        _oldValueCheck(true), _localRefValues(false),
        _maxGroupSize(0), _passCount(0), _iterations(gInternalIterations)
    {
    }
    virtual ~CBasicTest()
    {
        if(_passCount)
            log_info(" %u tests executed successfully for %s\n", _passCount, DataType().AtomicTypeName());
    }
    virtual cl_uint NumResults(cl_uint threadCount, cl_device_id deviceID)
    {
        return 1;
    }
    virtual cl_uint NumNonAtomicVariablesPerThread()
    {
        return 1;
    }
    virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount, HostDataType *startRefValues, cl_uint whichDestValue)
    {
        return false;
    }
    virtual bool GenerateRefs(cl_uint threadCount, HostDataType *startRefValues, MTdata d)
    {
        return false;
    }
    virtual bool VerifyRefs(bool &correct, cl_uint threadCount, HostDataType *refValues, HostAtomicType *finalValues)
    {
        return false;
    }
    virtual std::string PragmaHeader(cl_device_id deviceID);
    virtual std::string ProgramHeader(cl_uint maxNumDestItems);
    virtual std::string FunctionCode();
    virtual std::string KernelCode(cl_uint maxNumDestItems);
    virtual std::string ProgramCore() = 0;
    virtual std::string SingleTestName()
    {
        std::string testName = LocalMemory() ? "local" : "global";
        testName += " ";
        testName += DataType().AtomicTypeName();
        if(DeclaredInProgram())
        {
            testName += " declared in program";
        }
        if(DeclaredInProgram() && UsedInFunction())
            testName += ",";
        if(UsedInFunction())
        {
            testName += " used in ";
            if(GenericAddrSpace())
                testName += "generic ";
            testName += "function";
        }
        return testName;
    }
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue);
    int ExecuteForEachPointerType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        int error = 0;
        UsedInFunction(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        UsedInFunction(true);
        GenericAddrSpace(false);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(true);
        EXECUTE_TEST(error, ExecuteSingleTest(deviceID, context, queue));
        GenericAddrSpace(false);
        return error;
    }
    int ExecuteForEachDeclarationType(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        int error = 0;
        DeclaredInProgram(false);
        EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
        if(!UseSVM())
        {
            DeclaredInProgram(true);
            EXECUTE_TEST(error, ExecuteForEachPointerType(deviceID, context, queue));
        }
        return error;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        int error = 0;
        if(_maxDeviceThreads > 0 && !UseSVM())
        {
            LocalMemory(true);
            EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        if(_maxDeviceThreads+MaxHostThreads() > 0)
        {
            LocalMemory(false);
            EXECUTE_TEST(error, ExecuteForEachDeclarationType(deviceID, context, queue));
        }
        return error;
    }
    virtual int Execute(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements)
    {
        if(sizeof(HostAtomicType) != DataType().Size(deviceID))
        {
            log_info("Invalid test: Host atomic type size (%u) is different than OpenCL type size (%u)\n", (cl_uint)sizeof(HostAtomicType), DataType().Size(deviceID));
            return -1;
        }
        if(sizeof(HostAtomicType) != sizeof(HostDataType))
        {
            log_info("Invalid test: Host atomic type size (%u) is different than corresponding type size (%u)\n", (cl_uint)sizeof(HostAtomicType), (cl_uint)sizeof(HostDataType));
            return -1;
        }
        // Verify we can run first
        if(UseSVM() && !gUseHostPtr)
        {
            cl_device_svm_capabilities caps;
            cl_int error = clGetDeviceInfo(deviceID, CL_DEVICE_SVM_CAPABILITIES, sizeof(caps), &caps, 0);
            test_error(error, "clGetDeviceInfo failed");
            if((caps & CL_DEVICE_SVM_ATOMICS) == 0)
            {
                log_info("\t%s - SVM_ATOMICS not supported\n", DataType().AtomicTypeName());
                // implicit pass
                return 0;
            }
        }
        if(!DataType().IsSupported(deviceID))
        {
            log_info("\t%s not supported\n", DataType().AtomicTypeName());
            // implicit pass or host test (debug feature)
            if(UseSVM())
                return 0;
            _maxDeviceThreads = 0;
        }
        if(_maxDeviceThreads+MaxHostThreads() == 0)
            return 0;
        return ExecuteForEachParameterSet(deviceID, context, queue);
    }
    virtual void HostFunction(cl_uint tid, cl_uint threadCount, volatile HostAtomicType *destMemory, HostDataType *oldValues)
    {
        log_info("Empty thread function %u\n", (cl_uint)tid);
    }
    AtomicTypeExtendedInfo<HostDataType> DataType() const
    {
        return AtomicTypeExtendedInfo<HostDataType>(_dataType);
    }
    cl_uint _maxDeviceThreads;
    virtual cl_uint MaxHostThreads()
    {
        if(UseSVM() || gHost)
            return MAX_HOST_THREADS;
        else
            return 0;
    }

    int CheckCapabilities(TExplicitMemoryScopeType memoryScope,
                          TExplicitMemoryOrderType memoryOrder)
    {
        /*
          Differentiation between atomic fence and other atomic operations
          does not need to occur here.

          The initialisation of this test checks that the minimum required
          capabilities are supported by this device.

          The following switches allow the test to skip if optional
          capabilities are not supported by the device.
        */
        switch (memoryScope)
        {
            case MEMORY_SCOPE_EMPTY: {
                break;
            }
            case MEMORY_SCOPE_WORK_GROUP: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_DEVICE: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_DEVICE) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_SCOPE_ALL_DEVICES: // fallthrough
            case MEMORY_SCOPE_ALL_SVM_DEVICES: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory scope\n");
                break;
            }
        }

        switch (memoryOrder)
        {
            case MEMORY_ORDER_EMPTY: {
                break;
            }
            case MEMORY_ORDER_RELAXED: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_RELAXED) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_ACQUIRE:
            case MEMORY_ORDER_RELEASE:
            case MEMORY_ORDER_ACQ_REL: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_ACQ_REL) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            case MEMORY_ORDER_SEQ_CST: {
                if ((gAtomicMemCap & CL_DEVICE_ATOMIC_ORDER_SEQ_CST) == 0)
                {
                    return TEST_SKIPPED_ITSELF;
                }
                break;
            }
            default: {
                log_info("Invalid memory order\n");
                break;
            }
        }

        return 0;
    }
    virtual bool SVMDataBufferAllSVMConsistent() {return false;}
    bool UseSVM() {return _useSVM;}
    void StartValue(HostDataType startValue) {_startValue = startValue;}
    HostDataType StartValue() {return _startValue;}
    void LocalMemory(bool local) {_localMemory = local;}
    bool LocalMemory() {return _localMemory;}
    void DeclaredInProgram(bool declaredInProgram) {_declaredInProgram = declaredInProgram;}
    bool DeclaredInProgram() {return _declaredInProgram;}
    void UsedInFunction(bool local) {_usedInFunction = local;}
    bool UsedInFunction() {return _usedInFunction;}
    void GenericAddrSpace(bool genericAddrSpace) {_genericAddrSpace = genericAddrSpace;}
    bool GenericAddrSpace() {return _genericAddrSpace;}
    void OldValueCheck(bool check) {_oldValueCheck = check;}
    bool OldValueCheck() {return _oldValueCheck;}
    void LocalRefValues(bool localRefValues) {_localRefValues = localRefValues;}
    bool LocalRefValues() {return _localRefValues;}
    void MaxGroupSize(cl_uint maxGroupSize) {_maxGroupSize = maxGroupSize;}
    cl_uint MaxGroupSize() {return _maxGroupSize;}
    void CurrentGroupSize(cl_uint currentGroupSize)
    {
        if(MaxGroupSize() && MaxGroupSize() < currentGroupSize)
            _currentGroupSize = MaxGroupSize();
        else
            _currentGroupSize = currentGroupSize;
    }
    cl_uint CurrentGroupSize() {return _currentGroupSize;}
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if(threadCount == 0)
            return 0;
        if(LocalMemory())
            return 1;
        return threadCount/CurrentGroupSize();
    }
    cl_int Iterations() {return _iterations;}
    std::string IterationsStr() {std::stringstream ss; ss << _iterations; return ss.str();}
private:
    const TExplicitAtomicType _dataType;
    const bool _useSVM;
    HostDataType _startValue;
    bool _localMemory;
    bool _declaredInProgram;
    bool _usedInFunction;
    bool _genericAddrSpace;
    bool _oldValueCheck;
    bool _localRefValues;
    cl_uint _maxGroupSize;
    cl_uint _currentGroupSize;
    cl_uint _passCount;
    const cl_int _iterations;
};

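// Illustrative sketch only (hypothetical, not part of the suite): a concrete
// test typically derives from CBasicTest (or one of the classes below), emits
// its OpenCL snippet from ProgramCore() and checks the result via
// ExpectedValue() and/or VerifyRefs(). Roughly:
//
//   template<typename HostAtomicType, typename HostDataType>
//   class CExampleFetchAddTest : public CBasicTest<HostAtomicType, HostDataType>
//   {
//   public:
//       CExampleFetchAddTest(TExplicitAtomicType dataType, bool useSVM)
//           : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM) {}
//       virtual std::string ProgramCore()
//       {
//           // each thread increments destMemory[0] and records the old value
//           return "  oldValues[tid] = atomic_fetch_add(&destMemory[0], ("
//               + std::string(this->DataType().AddSubOperandTypeName()) + ")1);\n";
//       }
//       virtual bool ExpectedValue(HostDataType &expected, cl_uint threadCount,
//                                  HostDataType *startRefValues, cl_uint whichDestValue)
//       {
//           expected = this->StartValue() + (HostDataType)threadCount;
//           return true;
//       }
//   };
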
template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrderScope : public CBasicTest<HostAtomicType, HostDataType>
{
public:
    using CBasicTest<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTest<HostAtomicType, HostDataType>::MaxGroupSize;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;
    CBasicTestMemOrderScope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTest<HostAtomicType, HostDataType>(dataType, useSVM)
    {
    }
    virtual std::string ProgramHeader(cl_uint maxNumDestItems)
    {
        std::string header;
        if(gOldAPI)
        {
            std::string s = MemoryScope() == MEMORY_SCOPE_EMPTY ? "" : ",s";
            header +=
                "#define atomic_store_explicit(x,y,o"+s+") atomic_store(x,y)\n"
                "#define atomic_load_explicit(x,o"+s+") atomic_load(x)\n"
                "#define atomic_exchange_explicit(x,y,o"+s+") atomic_exchange(x,y)\n"
                "#define atomic_compare_exchange_strong_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_strong(x,y,z)\n"
                "#define atomic_compare_exchange_weak_explicit(x,y,z,os,of"+s+") atomic_compare_exchange_weak(x,y,z)\n"
                "#define atomic_fetch_add_explicit(x,y,o"+s+") atomic_fetch_add(x,y)\n"
                "#define atomic_fetch_sub_explicit(x,y,o"+s+") atomic_fetch_sub(x,y)\n"
                "#define atomic_fetch_or_explicit(x,y,o"+s+") atomic_fetch_or(x,y)\n"
                "#define atomic_fetch_xor_explicit(x,y,o"+s+") atomic_fetch_xor(x,y)\n"
                "#define atomic_fetch_and_explicit(x,y,o"+s+") atomic_fetch_and(x,y)\n"
                "#define atomic_fetch_min_explicit(x,y,o"+s+") atomic_fetch_min(x,y)\n"
                "#define atomic_fetch_max_explicit(x,y,o"+s+") atomic_fetch_max(x,y)\n"
                "#define atomic_flag_test_and_set_explicit(x,o"+s+") atomic_flag_test_and_set(x)\n"
                "#define atomic_flag_clear_explicit(x,o"+s+") atomic_flag_clear(x)\n";
        }
        return header+CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(maxNumDestItems);
    }
    virtual std::string SingleTestName()
    {
        std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if(MemoryOrder() != MEMORY_ORDER_EMPTY)
        {
            testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
        }
        if(MemoryScope() != MEMORY_SCOPE_EMPTY)
        {
            testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
        }
        return testName;
    }
    virtual int ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        if(LocalMemory() &&
            MemoryScope() != MEMORY_SCOPE_EMPTY &&
            MemoryScope() != MEMORY_SCOPE_WORK_GROUP) // memory scope should only be used for global memory
            return 0;
        if(MemoryScope() == MEMORY_SCOPE_DEVICE)
            MaxGroupSize(16); // increase number of groups by forcing smaller group size
        else
            MaxGroupSize(0); // group size limited by device capabilities

        if (CheckCapabilities(MemoryScope(), MemoryOrder()) == TEST_SKIPPED_ITSELF)
            return 0; // skip test - not applicable

        return CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(deviceID, context, queue);
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        // repeat test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL-3.0 and later some orderings and scopes are optional, so here
        // we query for the supported ones.
        test_error_ret(
            getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
            "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for(unsigned si = 0; si < memoryScope.size(); si++)
            {
                if(memoryOrder[oi] == MEMORY_ORDER_EMPTY && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                    continue;
                MemoryOrder(memoryOrder[oi]);
                MemoryScope(memoryScope[si]);
                EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
            }
        }
        return error;
    }
    void MemoryOrder(TExplicitMemoryOrderType memoryOrder) {_memoryOrder = memoryOrder;}
    TExplicitMemoryOrderType MemoryOrder() {return _memoryOrder;}
    std::string MemoryOrderStr()
    {
        if(MemoryOrder() != MEMORY_ORDER_EMPTY)
            return std::string(", ")+get_memory_order_type_name(MemoryOrder());
        return "";
    }
    void MemoryScope(TExplicitMemoryScopeType memoryScope) {_memoryScope = memoryScope;}
    TExplicitMemoryScopeType MemoryScope() {return _memoryScope;}
    std::string MemoryScopeStr()
    {
        if(MemoryScope() != MEMORY_SCOPE_EMPTY)
            return std::string(", ")+get_memory_scope_type_name(MemoryScope());
        return "";
    }
    std::string MemoryOrderScopeStr()
    {
        return MemoryOrderStr()+MemoryScopeStr();
    }
    virtual cl_uint CurrentGroupNum(cl_uint threadCount)
    {
        if(MemoryScope() == MEMORY_SCOPE_WORK_GROUP)
            return 1;
        return CBasicTest<HostAtomicType, HostDataType>::CurrentGroupNum(threadCount);
    }
    virtual cl_uint MaxHostThreads()
    {
        // Run host threads only when the memory scope covers the host, i.e.
        // memory_scope_all_svm_devices (or its all-devices alias), or in the
        // host-threads-only debug mode (gHost).
        if (MemoryScope() == MEMORY_SCOPE_ALL_DEVICES
            || MemoryScope() == MEMORY_SCOPE_ALL_SVM_DEVICES || gHost)
        {
            return CBasicTest<HostAtomicType, HostDataType>::MaxHostThreads();
        }
        else
        {
            return 0;
        }
    }
private:
    TExplicitMemoryOrderType _memoryOrder;
    TExplicitMemoryScopeType _memoryScope;
};

template<typename HostAtomicType, typename HostDataType>
class CBasicTestMemOrder2Scope : public CBasicTestMemOrderScope<HostAtomicType, HostDataType>
{
public:
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::LocalMemory;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrder;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScope;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryOrderStr;
    using CBasicTestMemOrderScope<HostAtomicType, HostDataType>::MemoryScopeStr;
    using CBasicTest<HostAtomicType, HostDataType>::CheckCapabilities;

    CBasicTestMemOrder2Scope(TExplicitAtomicType dataType, bool useSVM = false) : CBasicTestMemOrderScope<HostAtomicType, HostDataType>(dataType, useSVM)
    {
    }
    virtual std::string SingleTestName()
    {
        std::string testName = CBasicTest<HostAtomicType, HostDataType>::SingleTestName();
        if(MemoryOrder() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder())).substr(sizeof("memory"));
        if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
            testName += std::string(", ")+std::string(get_memory_order_type_name(MemoryOrder2())).substr(sizeof("memory"));
        if(MemoryScope() != MEMORY_SCOPE_EMPTY)
            testName += std::string(", ")+std::string(get_memory_scope_type_name(MemoryScope())).substr(sizeof("memory"));
        return testName;
    }
    virtual int ExecuteForEachParameterSet(cl_device_id deviceID, cl_context context, cl_command_queue queue)
    {
        // repeat test for each reasonable memory order/scope combination
        std::vector<TExplicitMemoryOrderType> memoryOrder;
        std::vector<TExplicitMemoryScopeType> memoryScope;
        int error = 0;

        // For OpenCL-3.0 and later some orderings and scopes are optional, so here
        // we query for the supported ones.
        test_error_ret(
            getSupportedMemoryOrdersAndScopes(deviceID, memoryOrder, memoryScope),
            "getSupportedMemoryOrdersAndScopes failed\n", TEST_FAIL);

        for(unsigned oi = 0; oi < memoryOrder.size(); oi++)
        {
            for(unsigned o2i = 0; o2i < memoryOrder.size(); o2i++)
            {
                for(unsigned si = 0; si < memoryScope.size(); si++)
                {
                    if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryOrder[oi] != memoryOrder[o2i])
                        continue; // both memory order arguments must be set (or none)
                    if((memoryOrder[oi] == MEMORY_ORDER_EMPTY || memoryOrder[o2i] == MEMORY_ORDER_EMPTY)
                        && memoryScope[si] != MEMORY_SCOPE_EMPTY)
                        continue; // memory scope without memory order is not allowed
                    MemoryOrder(memoryOrder[oi]);
                    MemoryOrder2(memoryOrder[o2i]);
                    MemoryScope(memoryScope[si]);

                    if (CheckCapabilities(MemoryScope(), MemoryOrder())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    if (CheckCapabilities(MemoryScope(), MemoryOrder2())
                        == TEST_SKIPPED_ITSELF)
                        continue; // skip test - not applicable

                    EXECUTE_TEST(error, (CBasicTest<HostAtomicType, HostDataType>::ExecuteForEachParameterSet(deviceID, context, queue)));
                }
            }
        }
        return error;
    }
    void MemoryOrder2(TExplicitMemoryOrderType memoryOrderFail) {_memoryOrder2 = memoryOrderFail;}
    TExplicitMemoryOrderType MemoryOrder2() {return _memoryOrder2;}
    std::string MemoryOrderFailStr()
    {
        if(MemoryOrder2() != MEMORY_ORDER_EMPTY)
            return std::string(", ")+get_memory_order_type_name(MemoryOrder2());
        return "";
    }
    std::string MemoryOrderScope()
    {
        return MemoryOrderStr()+MemoryOrderFailStr()+MemoryScopeStr();
    }
private:
    TExplicitMemoryOrderType _memoryOrder2;
};

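// Note: in CBasicTestMemOrder2Scope the second order (MemoryOrder2, set via
// the memoryOrderFail parameter) is intended for operations that take two
// memory order arguments, such as atomic_compare_exchange_*_explicit, where
// it plays the role of the "failure" ordering; both orders are checked
// against the device capabilities before a combination is run.
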
template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::PragmaHeader(cl_device_id deviceID)
{
    std::string pragma;

    if(gOldAPI)
    {
        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n";
    }
    // Create the pragma lines for this kernel
    if(DataType().Size(deviceID) == 8)
    {
        pragma += "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
        pragma += "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n";
    }
    if(_dataType == TYPE_ATOMIC_DOUBLE)
        pragma += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
    return pragma;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::ProgramHeader(cl_uint maxNumDestItems)
{
    // Create the program header
    std::string header;
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string argListForKernel;
    std::string argListForFunction;
    std::string argListNoTypes;
    std::string functionPrototype;
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";

    if(gOldAPI)
    {
        header += std::string("#define ")+aTypeName+" "+cTypeName+"\n"
            "#define atomic_store(x,y) (*(x) = y)\n"
            "#define atomic_load(x) (*(x))\n"
            "#define ATOMIC_VAR_INIT(x) (x)\n"
            "#define ATOMIC_FLAG_INIT 0\n"
            "#define atomic_init(x,y) atomic_store(x,y)\n";
        if(aTypeName == "atomic_float")
            header += "#define atomic_exchange(x,y) atomic_xchg(x,y)\n";
        else if(aTypeName == "atomic_double")
            header += "double atomic_exchange(volatile "+addressSpace+"atomic_double *x, double y)\n"
                "{\n"
                "  long tmp = *(long*)&y, res;\n"
                "  volatile "+addressSpace+"long *tmpA = (volatile "+addressSpace+"long*)x;\n"
                "  res = atom_xchg(tmpA,tmp);\n"
                "  return *(double*)&res;\n"
                "}\n";
        else
            header += "#define atomic_exchange(x,y) atom_xchg(x,y)\n";
        if(aTypeName != "atomic_float" && aTypeName != "atomic_double")
            header +=
                "bool atomic_compare_exchange_strong(volatile "+addressSpace+" "+aTypeName+" *a, "+cTypeName+" *expected, "+cTypeName+" desired)\n"
                "{\n"
                "  "+cTypeName+" old = atom_cmpxchg(a, *expected, desired);\n"
                "  if(old == *expected)\n"
                "    return true;\n"
                "  *expected = old;\n"
                "  return false;\n"
                "}\n"
                "#define atomic_compare_exchange_weak atomic_compare_exchange_strong\n";
        header +=
            "#define atomic_fetch_add(x,y) atom_add(x,y)\n"
            "#define atomic_fetch_sub(x,y) atom_sub(x,y)\n"
            "#define atomic_fetch_or(x,y) atom_or(x,y)\n"
            "#define atomic_fetch_xor(x,y) atom_xor(x,y)\n"
            "#define atomic_fetch_and(x,y) atom_and(x,y)\n"
            "#define atomic_fetch_min(x,y) atom_min(x,y)\n"
            "#define atomic_fetch_max(x,y) atom_max(x,y)\n"
            "#define atomic_flag_test_and_set(x) atomic_exchange(x,1)\n"
            "#define atomic_flag_clear(x) atomic_store(x,0)\n"
            "\n";
    }
    if(!LocalMemory() && DeclaredInProgram())
    {
        // additional atomic variable for results copying (last thread will do this)
        header += "__global volatile atomic_uint finishedThreads = ATOMIC_VAR_INIT(0);\n";
        // atomic variables declared in program scope - test data
        std::stringstream ss;
        ss << maxNumDestItems;
        header += std::string("__global volatile ")+aTypeName+" destMemory["+ss.str()+"] = {\n";
        ss.str("");
        ss << _startValue;
        for(cl_uint i = 0; i < maxNumDestItems; i++)
        {
            if(aTypeName == "atomic_flag")
                header += "  ATOMIC_FLAG_INIT";
            else
                header += "  ATOMIC_VAR_INIT("+ss.str()+")";
            if(i+1 < maxNumDestItems)
                header += ",";
            header += "\n";
        }
        header+=
            "};\n"
            "\n";
    }
    return header;
}

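// Note on gOldAPI: when testing against OpenCL 1.2, PragmaHeader() enables the
// legacy cl_khr_*_atomics extensions and ProgramHeader() maps the C11-style
// atomic_* calls used by the tests onto the pre-2.0 atom_*/atomic_xchg
// built-ins (with compare-exchange emulated via atom_cmpxchg), so the same
// generated kernels can be built on both API versions.
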
template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::FunctionCode()
{
    if(!UsedInFunction())
        return "";
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "void test_atomic_function(uint tid, uint threadCount, uint numDestItems, volatile ";
    if(!GenericAddrSpace())
        code += addressSpace;
    code += std::string(DataType().AtomicTypeName())+" *destMemory, __global "+DataType().RegularTypeName()+
        " *oldValues";
    if(LocalRefValues())
        code += std::string(", __local ")+DataType().RegularTypeName()+" *localValues";
    code += ")\n"
        "{\n";
    code += ProgramCore();
    code += "}\n"
        "\n";
    return code;
}

template<typename HostAtomicType, typename HostDataType>
std::string CBasicTest<HostAtomicType, HostDataType>::KernelCode(cl_uint maxNumDestItems)
{
    std::string aTypeName = DataType().AtomicTypeName();
    std::string cTypeName = DataType().RegularTypeName();
    std::string addressSpace = LocalMemory() ? "__local " : "__global ";
    std::string code = "__kernel void test_atomic_kernel(uint threadCount, uint numDestItems, ";

    // prepare list of arguments for kernel
    if(LocalMemory())
    {
        code += std::string("__global ")+cTypeName+" *finalDest, __global "+cTypeName+" *oldValues,"
            " volatile "+addressSpace+aTypeName+" *"+(DeclaredInProgram() ? "notUsed" : "")+"destMemory";
    }
    else
    {
        code += "volatile "+addressSpace+(DeclaredInProgram() ? (cTypeName+" *finalDest") : (aTypeName+" *destMemory"))+
            ", __global "+cTypeName+" *oldValues";
    }
    if(LocalRefValues())
        code += std::string(", __local ")+cTypeName+" *localValues";
    code += ")\n"
        "{\n";
    if(LocalMemory() && DeclaredInProgram())
    {
        // local atomics declared in kernel scope
        std::stringstream ss;
        ss << maxNumDestItems;
        code += std::string("  __local volatile ")+aTypeName+" destMemory["+ss.str()+"];\n";
    }
    code += "  uint tid = get_global_id(0);\n"
        "\n";
    if(LocalMemory())
    {
        // memory_order_relaxed is sufficient for these initialization operations
        // as the barrier below will act as a fence, providing an order to the
        // operations. memory_scope_work_group is sufficient as local memory is
        // only visible within the work-group.
        code += R"(
  // initialize atomics not reachable from host (first thread
  // is doing this, other threads are waiting on barrier)
  if(get_local_id(0) == 0)
    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)
    {)";
        if (aTypeName == "atomic_flag")
        {
            code += R"(
      if(finalDest[dstItemIdx])
        atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);
      else
        atomic_flag_clear_explicit(destMemory+dstItemIdx,
                                   memory_order_relaxed,
                                   memory_scope_work_group);)";
        }
        else
        {
            code += R"(
      atomic_store_explicit(destMemory+dstItemIdx,
                            finalDest[dstItemIdx],
                            memory_order_relaxed,
                            memory_scope_work_group);)";
        }
        code +=
            "  }\n"
            "  barrier(CLK_LOCAL_MEM_FENCE);\n"
            "\n";
    }
    if (LocalRefValues())
    {
        code +=
            "  // Copy input reference values into local memory\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += "  localValues[get_local_id(0)] = oldValues[tid];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code +=
                "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
                "    localValues[get_local_id(0)*" + ss.str() + "+rfId] = oldValues[tid*" + ss.str() + "+rfId];\n";
        }
        code +=
            "  barrier(CLK_LOCAL_MEM_FENCE);\n"
            "\n";
    }
    if (UsedInFunction())
        code += std::string("  test_atomic_function(tid, threadCount, numDestItems, destMemory, oldValues")+
            (LocalRefValues() ? ", localValues" : "")+");\n";
    else
        code += ProgramCore();
    code += "\n";
    if (LocalRefValues())
    {
        code +=
            "  // Copy local reference values into output array\n"
            "  barrier(CLK_LOCAL_MEM_FENCE);\n";
        if (NumNonAtomicVariablesPerThread() == 1)
            code += "  oldValues[tid] = localValues[get_local_id(0)];\n";
        else
        {
            std::stringstream ss;
            ss << NumNonAtomicVariablesPerThread();
            code +=
                "  for(uint rfId = 0; rfId < " + ss.str() + "; rfId++)\n"
                "    oldValues[tid*" + ss.str() + "+rfId] = localValues[get_local_id(0)*" + ss.str() + "+rfId];\n";
        }
        code += "\n";
    }
    if(LocalMemory() || DeclaredInProgram())
    {
        code += "  // Copy final values to host reachable buffer\n";
        if(LocalMemory())
            code +=
                "  barrier(CLK_LOCAL_MEM_FENCE);\n"
                "  if(get_local_id(0) == 0) // first thread in workgroup\n";
        else
            // global atomics declared in program scope
            code += R"(
  if(atomic_fetch_add_explicit(&finishedThreads, 1u,
                               memory_order_relaxed,
                               memory_scope_work_group)
     == get_global_size(0)-1) // last finished thread
)";
        code +=
            "    for(uint dstItemIdx = 0; dstItemIdx < numDestItems; dstItemIdx++)\n";
        if(aTypeName == "atomic_flag")
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_flag_test_and_set_explicit(destMemory+dstItemIdx,
                                          memory_order_relaxed,
                                          memory_scope_work_group);)";
        }
        else
        {
            code += R"(
      finalDest[dstItemIdx] =
        atomic_load_explicit(destMemory+dstItemIdx,
                             memory_order_relaxed,
                             memory_scope_work_group);)";
        }
    }
    code += "}\n"
        "\n";
    return code;
}

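// For reference, KernelCode() above produces a kernel of roughly this shape
// in the simplest configuration (global memory, atomics passed in as a buffer,
// atomic_int chosen as an example type; the exact text varies with the test
// configuration):
//
//   __kernel void test_atomic_kernel(uint threadCount, uint numDestItems,
//                                    volatile __global atomic_int *destMemory,
//                                    __global int *oldValues)
//   {
//       uint tid = get_global_id(0);
//       <ProgramCore() snippet operating on destMemory/oldValues>
//   }
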
template <typename HostAtomicType, typename HostDataType>
int CBasicTest<HostAtomicType, HostDataType>::ExecuteSingleTest(cl_device_id deviceID, cl_context context, cl_command_queue queue)
{
    int error;
    clProgramWrapper program;
    clKernelWrapper kernel;
    size_t threadNum[1];
    clMemWrapper streams[2];
    std::vector<HostAtomicType> destItems;
    HostAtomicType *svmAtomicBuffer = 0;
    std::vector<HostDataType> refValues, startRefValues;
    HostDataType *svmDataBuffer = 0;
    cl_uint deviceThreadCount, hostThreadCount, threadCount;
    size_t groupSize = 0;
    std::string programSource;
    const char *programLine;
    MTdata d;
    size_t typeSize = DataType().Size(deviceID);

    deviceThreadCount = _maxDeviceThreads;
    hostThreadCount = MaxHostThreads();
    threadCount = deviceThreadCount+hostThreadCount;

    //log_info("\t%s %s%s...\n", local ? "local" : "global", DataType().AtomicTypeName(), memoryOrderScope.c_str());
    log_info("\t%s...\n", SingleTestName().c_str());

    if(!LocalMemory() && DeclaredInProgram() && gNoGlobalVariables) // no support for program scope global variables
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }
    if(UsedInFunction() && GenericAddrSpace() && gNoGenericAddressSpace)
    {
        log_info("\t\tTest disabled\n");
        return 0;
    }

    // set up work sizes based on device capabilities and test configuration
    error = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(groupSize), &groupSize, NULL);
    test_error(error, "Unable to obtain max work group size for device");
    CurrentGroupSize((cl_uint)groupSize);
    if(CurrentGroupSize() > deviceThreadCount)
        CurrentGroupSize(deviceThreadCount);
    if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
        deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
    threadCount = deviceThreadCount+hostThreadCount;

    // If we're given a num_results function, we need to determine how many result objects we need.
    // This is the first assessment for current maximum number of threads (exact thread count is not known here)
    // - needed for program source code generation (arrays of atomics declared in program)
    cl_uint numDestItems = NumResults(threadCount, deviceID);

    if(deviceThreadCount > 0)
    {
        // This loop iteratively halves the workgroup size and re-generates the
        // kernel for the reduced size, until we find a size that is admissible
        // for the kernel being run, or we reach the trivial workgroup size of 1
        // (which was separately verified to be accurate for the kernel being
        // run).

        while (CurrentGroupSize() > 1)
        {
            // Re-generate the kernel code with the current group size
            if (kernel) clReleaseKernel(kernel);
            if (program) clReleaseProgram(program);
            programSource = PragmaHeader(deviceID) + ProgramHeader(numDestItems)
                + FunctionCode() + KernelCode(numDestItems);
            programLine = programSource.c_str();
            if (create_single_kernel_helper_with_build_options(
                    context, &program, &kernel, 1, &programLine,
                    "test_atomic_kernel", gOldAPI ? "" : nullptr))
            {
                return -1;
            }
            // Get work group size for the new kernel
            error = clGetKernelWorkGroupInfo(kernel, deviceID,
                                             CL_KERNEL_WORK_GROUP_SIZE,
                                             sizeof(groupSize), &groupSize, NULL);
            test_error(error,
                       "Unable to obtain max work group size for device and "
                       "kernel combo");

            if (LocalMemory())
            {
                cl_ulong usedLocalMemory;
                cl_ulong totalLocalMemory;
                cl_uint maxWorkGroupSize;

                error = clGetKernelWorkGroupInfo(
                    kernel, deviceID, CL_KERNEL_LOCAL_MEM_SIZE,
                    sizeof(usedLocalMemory), &usedLocalMemory, NULL);
                test_error(error, "clGetKernelWorkGroupInfo failed");

                error = clGetDeviceInfo(deviceID, CL_DEVICE_LOCAL_MEM_SIZE,
                                        sizeof(totalLocalMemory),
                                        &totalLocalMemory, NULL);
                test_error(error, "clGetDeviceInfo failed");

                // We know that each work-group is going to use typeSize *
                // deviceThreadCount bytes of local memory
                // so pick the maximum value for deviceThreadCount that uses all
                // the local memory.
                maxWorkGroupSize =
                    ((totalLocalMemory - usedLocalMemory) / typeSize);

                if (maxWorkGroupSize < groupSize) groupSize = maxWorkGroupSize;
            }
            if (CurrentGroupSize() <= groupSize)
                break;
            else
                CurrentGroupSize(CurrentGroupSize() / 2);
        }
        if(CurrentGroupSize() > deviceThreadCount)
            CurrentGroupSize(deviceThreadCount);
        if(CurrentGroupNum(deviceThreadCount) == 1 || gOldAPI)
            deviceThreadCount = CurrentGroupSize()*CurrentGroupNum(deviceThreadCount);
        threadCount = deviceThreadCount+hostThreadCount;
    }
    if (gDebug)
    {
        log_info("Program source:\n");
        log_info("%s\n", programLine);
    }
    if(deviceThreadCount > 0)
        log_info("\t\t(thread count %u, group size %u)\n", deviceThreadCount, CurrentGroupSize());
    if(hostThreadCount > 0)
        log_info("\t\t(host threads %u)\n", hostThreadCount);

    refValues.resize(threadCount*NumNonAtomicVariablesPerThread());

    // Generate ref data if we have a ref generator provided
    d = init_genrand(gRandomSeed);
    startRefValues.resize(threadCount*NumNonAtomicVariablesPerThread());
    if(GenerateRefs(threadCount, &startRefValues[0], d))
    {
        //copy ref values for host threads
        memcpy(&refValues[0], &startRefValues[0], sizeof(HostDataType)*threadCount*NumNonAtomicVariablesPerThread());
    }
    else
    {
        startRefValues.resize(0);
    }
    free_mtdata(d);
    d = NULL;

    // If we're given a num_results function, we need to determine how many result objects we need. If
    // we don't have it, we assume it's just 1
    // This is final value (exact thread count is known in this place)
    numDestItems = NumResults(threadCount, deviceID);

    destItems.resize(numDestItems);
    for(cl_uint i = 0; i < numDestItems; i++)
        destItems[i] = _startValue;

    // Create main buffer with atomic variables (array size dependent on particular test)
    if(UseSVM())
    {
        if(gUseHostPtr)
            svmAtomicBuffer = (HostAtomicType*)malloc(typeSize * numDestItems);
        else
            svmAtomicBuffer = (HostAtomicType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS, typeSize * numDestItems, 0);
        if(!svmAtomicBuffer)
        {
            log_error("ERROR: clSVMAlloc failed!\n");
            return -1;
        }
        memcpy(svmAtomicBuffer, &destItems[0], typeSize * numDestItems);
        streams[0] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                                    typeSize * numDestItems, svmAtomicBuffer, NULL);
    }
    else
    {
        streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
                                    typeSize * numDestItems, &destItems[0], NULL);
    }
    if (!streams[0])
    {
        log_error("ERROR: Creating output array failed!\n");
        return -1;
    }
    // Create buffer for per-thread input/output data
    if(UseSVM())
    {
        if(gUseHostPtr)
            svmDataBuffer = (HostDataType*)malloc(typeSize*threadCount*NumNonAtomicVariablesPerThread());
        else
            svmDataBuffer = (HostDataType*)clSVMAlloc(context, CL_MEM_SVM_FINE_GRAIN_BUFFER | (SVMDataBufferAllSVMConsistent() ? CL_MEM_SVM_ATOMICS : 0), typeSize*threadCount*NumNonAtomicVariablesPerThread(), 0);
        if(!svmDataBuffer)
        {
            log_error("ERROR: clSVMAlloc failed!\n");
            return -1;
        }
        if(startRefValues.size())
            memcpy(svmDataBuffer, &startRefValues[0], typeSize*threadCount*NumNonAtomicVariablesPerThread());
        streams[1] = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
                                    typeSize * threadCount
                                        * NumNonAtomicVariablesPerThread(),
                                    svmDataBuffer, NULL);
    }
    else
    {
        streams[1] = clCreateBuffer(
            context,
            ((startRefValues.size() ? CL_MEM_COPY_HOST_PTR : CL_MEM_READ_WRITE)),
            typeSize * threadCount * NumNonAtomicVariablesPerThread(),
            startRefValues.size() ? &startRefValues[0] : 0, NULL);
    }
    if (!streams[1])
    {
        log_error("ERROR: Creating reference array failed!\n");
        return -1;
    }
    if(deviceThreadCount > 0)
    {
        cl_uint argInd = 0;
        /* Set the arguments */
        error = clSetKernelArg(kernel, argInd++, sizeof(threadCount), &threadCount);
        test_error(error, "Unable to set kernel argument");
        error = clSetKernelArg(kernel, argInd++, sizeof(numDestItems), &numDestItems);
        test_error(error, "Unable to set indexed kernel argument");
        error = clSetKernelArg(kernel, argInd++, sizeof(streams[0]), &streams[0]);
        test_error(error, "Unable to set indexed kernel arguments");
        error = clSetKernelArg(kernel, argInd++, sizeof(streams[1]), &streams[1]);
        test_error(error, "Unable to set indexed kernel arguments");
        if(LocalMemory())
        {
            error = clSetKernelArg(kernel, argInd++, typeSize * numDestItems, NULL);
            test_error(error, "Unable to set indexed local kernel argument");
        }
        if(LocalRefValues())
        {
            error = clSetKernelArg(kernel, argInd++, LocalRefValues() ? typeSize*CurrentGroupSize()*NumNonAtomicVariablesPerThread() : 1, NULL);
            test_error(error, "Unable to set indexed kernel argument");
        }
    }
    /* Configure host threads */
    std::vector<THostThreadContext> hostThreadContexts(hostThreadCount);
    for(unsigned int t = 0; t < hostThreadCount; t++)
    {
        hostThreadContexts[t].test = this;
        hostThreadContexts[t].tid = deviceThreadCount+t;
        hostThreadContexts[t].threadCount = threadCount;
        hostThreadContexts[t].destMemory = UseSVM() ? svmAtomicBuffer : &destItems[0];
        hostThreadContexts[t].oldValues = UseSVM() ? svmDataBuffer : &refValues[0];
    }

    if(deviceThreadCount > 0)
    {
        /* Run the kernel */
        threadNum[0] = deviceThreadCount;
        groupSize = CurrentGroupSize();
        error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, &groupSize, 0, NULL, NULL);
        test_error(error, "Unable to execute test kernel");
        /* start device threads */
        error = clFlush(queue);
        test_error(error, "clFlush failed");
    }

    /* Start host threads and wait for finish */
    if(hostThreadCount > 0)
        ThreadPool_Do(HostThreadFunction, hostThreadCount, &hostThreadContexts[0]);

    if(UseSVM())
    {
        error = clFinish(queue);
        test_error(error, "clFinish failed");
        memcpy(&destItems[0], svmAtomicBuffer, typeSize*numDestItems);
        memcpy(&refValues[0], svmDataBuffer, typeSize*threadCount*NumNonAtomicVariablesPerThread());
    }
    else
    {
        if(deviceThreadCount > 0)
        {
            error = clEnqueueReadBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
            test_error(error, "Unable to read result value!");
            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize * deviceThreadCount*NumNonAtomicVariablesPerThread(), &refValues[0], 0, NULL, NULL);
            test_error(error, "Unable to read reference values!");
        }
    }
    bool dataVerified = false;
    // If we have an expectedFn, then we need to generate a final value to compare against. If we don't
    // have one, it's because we're comparing ref values only
    for(cl_uint i = 0; i < numDestItems; i++)
    {
        HostDataType expected;

        if(!ExpectedValue(expected, threadCount, startRefValues.size() ? &startRefValues[0] : 0, i))
            break; // no expected value function provided

        if(expected != destItems[i])
        {
            std::stringstream logLine;
            logLine << "ERROR: Result " << i << " from kernel does not validate! (should be " << expected << ", was " << destItems[i] << ")\n";
            log_error("%s", logLine.str().c_str());
            for(i = 0; i < threadCount; i++)
            {
                logLine.str("");
                logLine << " --- " << i << " - ";
                if(startRefValues.size())
                    logLine << startRefValues[i] << " -> " << refValues[i];
                else
                    logLine << refValues[i];
                logLine << " --- ";
                if(i < numDestItems)
                    logLine << destItems[i];
                logLine << "\n";
                log_info("%s", logLine.str().c_str());
            }
            if(!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
        dataVerified = true;
    }

    bool dataCorrect = false;
    /* Use the verify function (if provided) to also check the results */
    if(VerifyRefs(dataCorrect, threadCount, &refValues[0], &destItems[0]))
    {
        if(!dataCorrect)
        {
            log_error("ERROR: Reference values did not validate!\n");
            std::stringstream logLine;
            for(cl_uint i = 0; i < threadCount; i++)
                for (cl_uint j = 0; j < NumNonAtomicVariablesPerThread(); j++)
                {
                    logLine.str("");
                    logLine << " --- " << i << " - " << refValues[i*NumNonAtomicVariablesPerThread()+j] << " --- ";
                    if(j == 0 && i < numDestItems)
                        logLine << destItems[i];
                    logLine << "\n";
                    log_info("%s", logLine.str().c_str());
                }
            if(!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
    }
    else if(!dataVerified)
    {
        log_error("ERROR: Test doesn't check total or refs; no values are verified!\n");
        return -1;
    }

    if(OldValueCheck() &&
        !(DeclaredInProgram() && !LocalMemory())) // don't test for program scope global atomics
        // 'old' value has been overwritten by previous clEnqueueNDRangeKernel
    {
        /* Re-write the starting value */
        for(size_t i = 0; i < numDestItems; i++)
            destItems[i] = _startValue;
        refValues[0] = 0;
        if(deviceThreadCount > 0)
        {
            error = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, typeSize * numDestItems, &destItems[0], 0, NULL, NULL);
            test_error(error, "Unable to write starting values!");

            /* Run the kernel once for a single thread, so we can verify that the returned value is the original one */
            threadNum[0] = 1;
            error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, threadNum, threadNum, 0, NULL, NULL);
            test_error(error, "Unable to execute test kernel");

            error = clEnqueueReadBuffer(queue, streams[1], CL_TRUE, 0, typeSize, &refValues[0], 0, NULL, NULL);
            test_error(error, "Unable to read reference values!");
        }
        else
        {
            /* Start host thread */
            HostFunction(0, 1, &destItems[0], &refValues[0]);
        }

        if(refValues[0] != _startValue)//destItems[0])
        {
            std::stringstream logLine;
            logLine << "ERROR: atomic function operated correctly but did NOT return correct 'old' value "
                " (should have been " << destItems[0] << ", returned " << refValues[0] << ")!\n";
            log_error("%s", logLine.str().c_str());
            if(!gDebug)
            {
                log_info("Program source:\n");
                log_info("%s\n", programLine);
            }
            return -1;
        }
    }
    if(UseSVM())
    {
        // the buffer object must first be released before the SVM buffer is freed
        error = clReleaseMemObject(streams[0]);
        streams[0] = 0;
        test_error(error, "clReleaseMemObject failed");
        if(gUseHostPtr)
            free(svmAtomicBuffer);
        else
            clSVMFree(context, svmAtomicBuffer);
        error = clReleaseMemObject(streams[1]);
        streams[1] = 0;
        test_error(error, "clReleaseMemObject failed");
        if(gUseHostPtr)
            free(svmDataBuffer);
        else
            clSVMFree(context, svmDataBuffer);
    }
    _passCount++;
    return 0;
}

#endif //_COMMON_H_