// // Copyright (c) 2020 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "procs.h" #include "subhelpers.h" #include "harness/conversions.h" #include "harness/typeWrappers.h" // These need to stay in sync with the kernel source below #define NUM_LOC 49 #define INST_LOC_MASK 0x7f #define INST_OP_SHIFT 0 #define INST_OP_MASK 0xf #define INST_LOC_SHIFT 4 #define INST_VAL_SHIFT 12 #define INST_VAL_MASK 0x7ffff #define INST_END 0x0 #define INST_STORE 0x1 #define INST_WAIT 0x2 #define INST_COUNT 0x3 static const char *ifp_source = "#define NUM_LOC 49\n" "#define INST_LOC_MASK 0x7f\n" "#define INST_OP_SHIFT 0\n" "#define INST_OP_MASK 0xf\n" "#define INST_LOC_SHIFT 4\n" "#define INST_VAL_SHIFT 12\n" "#define INST_VAL_MASK 0x7ffff\n" "#define INST_END 0x0\n" "#define INST_STORE 0x1\n" "#define INST_WAIT 0x2\n" "#define INST_COUNT 0x3\n" "\n" "__kernel void\n" "test_ifp(const __global int *in, __global int4 *xy, __global int *out)\n" "{\n" " __local atomic_int loc[NUM_LOC];\n" "\n" " // Don't run if there is only one sub group\n" " if (get_num_sub_groups() == 1)\n" " return;\n" "\n" " // First initialize loc[]\n" " int lid = (int)get_local_id(0);\n" "\n" " if (lid < NUM_LOC)\n" " atomic_init(loc+lid, 0);\n" "\n" " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n" "\n" " // Compute pointer to this sub group's \"instructions\"\n" " const __global int *pc = in +\n" " ((int)get_group_id(0)*(int)get_enqueued_num_sub_groups() +\n" " (int)get_sub_group_id()) *\n" " (NUM_LOC+1);\n" "\n" " // Set up to \"run\"\n" " bool ok = (int)get_sub_group_local_id() == 0;\n" " bool run = true;\n" "\n" " while (run) {\n" " int inst = *pc++;\n" " int iop = (inst >> INST_OP_SHIFT) & INST_OP_MASK;\n" " int iloc = (inst >> INST_LOC_SHIFT) & INST_LOC_MASK;\n" " int ival = (inst >> INST_VAL_SHIFT) & INST_VAL_MASK;\n" "\n" " switch (iop) {\n" " case INST_STORE:\n" " if (ok)\n" " atomic_store(loc+iloc, ival);\n" " break;\n" " case INST_WAIT:\n" " if (ok) {\n" " while (atomic_load(loc+iloc) != ival)\n" " ;\n" " }\n" " break;\n" " case INST_COUNT:\n" " if (ok) {\n" " int i;\n" " for (i=0;i NUM_LOC // Input needs num_groups * num_sub_groups * (NUM_LOC+1) elements static inline int inst(int op, int loc, int val) { return (val << INST_VAL_SHIFT) | (loc << INST_LOC_SHIFT) | (op << INST_OP_SHIFT); } void gen_insts(cl_int *x, cl_int *p, int n) { int i, j0, j1; int val; int ii[NUM_LOC]; // Create a random permutation of 0...NUM_LOC-1 ii[0] = 0; for (i = 1; i < NUM_LOC; ++i) { j0 = random_in_range(0, i, gMTdata); if (j0 != i) ii[i] = ii[j0]; ii[j0] = i; } // Initialize "instruction pointers" memset(p, 0, n * 4); for (i = 0; i < NUM_LOC; ++i) { // Randomly choose 2 different sub groups // One does a random amount of work, and the other waits for it j0 = random_in_range(0, n - 1, gMTdata); do { j1 = random_in_range(0, n - 1, gMTdata); } while (j1 == j0); // Randomly choose a wait value and assign "instructions" val = random_in_range(100, 200 + 10 * NUM_LOC, gMTdata); x[j0 * (NUM_LOC + 1) + p[j0]] = inst(INST_COUNT, ii[i], val); x[j1 * (NUM_LOC + 1) + p[j1]] = inst(INST_WAIT, ii[i], val); ++p[j0]; ++p[j1]; } // Last "inst" for each sub group is END for (i = 0; i < n; ++i) x[i * (NUM_LOC + 1) + p[i]] = inst(INST_END, 0, 0); } // Execute one group's "instructions" void run_insts(cl_int *x, cl_int *p, int n) { int i, nend; bool scont; cl_int loc[NUM_LOC]; // Initialize result and "instruction pointers" memset(loc, 0, sizeof(loc)); memset(p, 0, 4 * n); // Repetitively loop over subgroups with each executing "instructions" until // blocked The loop terminates when all subgroups have hit the "END // instruction" do { nend = 0; for (i = 0; i < n; ++i) { do { cl_int inst = x[i * (NUM_LOC + 1) + p[i]]; cl_int iop = (inst >> INST_OP_SHIFT) & INST_OP_MASK; cl_int iloc = (inst >> INST_LOC_SHIFT) & INST_LOC_MASK; cl_int ival = (inst >> INST_VAL_SHIFT) & INST_VAL_MASK; scont = false; switch (iop) { case INST_STORE: loc[iloc] = ival; ++p[i]; scont = true; break; case INST_WAIT: if (loc[iloc] == ival) { ++p[i]; scont = true; } break; case INST_COUNT: loc[iloc] += ival; ++p[i]; scont = true; break; case INST_END: ++nend; break; } } while (scont); } } while (nend < n); // Return result, reusing "p" memcpy(p, loc, sizeof(loc)); } struct IFP { static void gen(cl_int *x, cl_int *t, cl_int *, const WorkGroupParams &test_params) { int k; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; ng = ng / nw; // We need at least 2 sub groups per group for this test if (nj == 1) return; for (k = 0; k < ng; ++k) { gen_insts(x, t, nj); x += nj * (NUM_LOC + 1); } } static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, const WorkGroupParams &test_params) { int i, k; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; ng = ng / nw; // We need at least 2 sub groups per group for this tes if (nj == 1) return 0; log_info(" independent forward progress...\n"); for (k = 0; k < ng; ++k) { run_insts(x, t, nj); for (i = 0; i < NUM_LOC; ++i) { if (t[i] != y[i]) { log_error( "ERROR: mismatch at element %d in work group %d\n", i, k); return -1; } } x += nj * (NUM_LOC + 1); y += NUM_LOC; } return 0; } }; int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements, bool useCoreSubgroups) { int error = TEST_PASS; // Global/local work group sizes // Adjust these individually below if desired/needed constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; WorkGroupParams test_params(global_work_size, local_work_size); test_params.use_core_subgroups = useCoreSubgroups; test_params.dynsc = NUM_LOC + 1; error = test::run(device, context, queue, num_elements, "test_ifp", ifp_source, test_params); return error; } static test_status checkIFPSupport(cl_device_id device, bool &ifpSupport) { cl_uint ifp_supported; cl_uint error; error = clGetDeviceInfo(device, CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, sizeof(ifp_supported), &ifp_supported, NULL); if (error != CL_SUCCESS) { print_error( error, "Unable to get CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS " "capability"); return TEST_FAIL; } // skip testing ifp if (ifp_supported != 1) { log_info("INDEPENDENT FORWARD PROGRESS not supported...\n"); ifpSupport = false; } else { log_info("INDEPENDENT FORWARD PROGRESS supported...\n"); ifpSupport = true; } return TEST_PASS; } int test_ifp_core(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { bool ifpSupport = true; test_status error; error = checkIFPSupport(device, ifpSupport); if (error != TEST_PASS) { return error; } if (ifpSupport == false) { log_info("Independed forward progress skipped.\n"); return TEST_SKIPPED_ITSELF; } return test_ifp(device, context, queue, num_elements, true); } int test_ifp_ext(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { bool hasExtension = is_extension_available(device, "cl_khr_subgroups"); bool ifpSupport = true; if (!hasExtension) { log_info( "Device does not support 'cl_khr_subgroups'. Skipping the test.\n"); return TEST_SKIPPED_ITSELF; } // ifp only in subgroup functions tests: test_status error; error = checkIFPSupport(device, ifpSupport); if (error != TEST_PASS) { return error; } if (ifpSupport == false) { log_info( "Error reason: the extension cl_khr_subgroups requires that " "Independed forward progress has to be supported by device.\n"); return TEST_FAIL; } return test_ifp(device, context, queue, num_elements, false); }