OpenCL 2.0 Compiler Segmentation Fault or LLVM Error

Hello,

 

I am trying to get a simple dynamic parallelism code to run using OpenCL 2.0 on my system. Unfortunately I get an LLVM error:

Error in hsa_code section, at offset 1572:

Instruction has invalid segment (global), expected one of: group, private

LLVM ERROR:

Brig container validation has failed in BRIGAsmPrinter.cpp

 

Sometimes the OpenCL compile process terminates with a Segmentation Fault...

My system: Ubuntu 14.04 x64, AMD A10-7890K Radeon R7, 12 Compute Cores 4C+8G, dmesg: module loaded - fglrx 15.20.3 [Sep  8 2015] with 1 minors

 

Here is my code:

#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <stddef.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>

using namespace std;

// Use a static data size for simplicity
//
#define DATA_SIZE (16)
const unsigned int data_size = DATA_SIZE;

// Simple compute kernel
//
const char *KernelSource = "\n"
        "#define FRAG_SIZE (16 / 8)                                                \n"
        "__kernel void square(                                                     \n"
        "   __global void* data,                                                   \n"
        "   unsigned int size,                                                     \n"
        "   __global void* error)                                                   \n"
        "{                                                                         \n"
        "   size_t pgid = get_global_id(0);                                        \n"
        "                                                                          \n"
        "   // create 1 dim ndrange                                                \n"
        "    ndrange_t ndrange = ndrange_1D(FRAG_SIZE);                             \n"
        "                                                                          \n"
        "   // store the device's default queue                                    \n"
        "   queue_t default_queue = get_default_queue();                           \n"
        "                                                                          \n"
        "   // declare/define the child kernels code                               \n"
        "   void (^fun_blk)(void) = ^{((__global char*)data)[pgid * FRAG_SIZE + get_global_id(0)] = pgid;}; \n"
        "                                                                          \n"
        "   // enqueue child kernels                                               \n"
        "   *((__global int*)error) = enqueue_kernel(default_queue,                                 \n"
        "      CLK_ENQUEUE_FLAGS_NO_WAIT,                                           \n"
        "      ndrange,                                                             \n"
        "      fun_blk);                                                            \n"
        "}                                                                         \n"
        "\n";

void print(char* data, unsigned int size) {
    for (unsigned int i = 0; i < size; ++i) {
        cout << "[";
        cout << setw(2) << setfill('0') << hex << (int) data[i];
        cout << "]";
    }
    cout << endl;
}

int main(void) {

    cl_int errCPU;                      // error code returned from api calls
    cl_int* errGPU;                      // error code returned from device CL C calls

    size_t global; // global domain size for our calculation
    size_t local;  // local domain size for our calculation

    cl_device_id device_id;  // compute device id
    cl_context context;  // compute context
    cl_command_queue queue_host;  // host's command queue
    cl_command_queue queue_device;  // device's command queue
    cl_program program;  // compute program
    cl_kernel kernel;  // compute kernel

    unsigned int count = 2;
    void* data;

// Connect to a platform
//
    cl_platform_id platforms[2];
    errCPU = clGetPlatformIDs(2, platforms, &count);
    assert(errCPU == CL_SUCCESS);

// Connect to a compute device
//
    errCPU = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    assert(errCPU == CL_SUCCESS);
    if (errCPU != CL_SUCCESS) {
        printf("Error: Failed to create a device group!\n");
        exit(1);
    }

    const size_t size = 1024;
    char deviceName[size];
    size_t size2 = 0;
    errCPU = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char) * size, (void*) deviceName, &size2);
    string strDeviceName(deviceName);
    cout << strDeviceName << endl;

    // Create a compute context
    //
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &errCPU);
    assert(errCPU == CL_SUCCESS);
    if (!context) {
        printf("Error: Failed to create a compute context!\n");
        exit(1);
    }

    // Fill our data set with random float values
    //
    data = clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, DATA_SIZE, 0);
    assert(data != nullptr);

    int i = 0;
    for (i = 0; i < DATA_SIZE; i++)
        reinterpret_cast<char*>(data)[i] = 'A';

    // Allocate SVM error memory
    //
    errGPU = reinterpret_cast<cl_int*>(clSVMAlloc(context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, sizeof(cl_int), 0));

    // Create host's command queues
    //
    cl_queue_properties props_host[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0, 0 };
    queue_host = clCreateCommandQueueWithProperties(context, device_id, props_host, &errCPU);
    assert(errCPU == CL_SUCCESS);

    // Create device's command queues
    //
    cl_queue_properties props_device[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT, 0, 0 };
    queue_device = clCreateCommandQueueWithProperties(context, device_id, props_device, &errCPU);
    assert(errCPU == CL_SUCCESS);

    if (!queue_host) {
        printf("Error: Failed to create a command commands!\n");
        exit(1);
    }

    // Create the compute program from the source buffer
    //
    program = clCreateProgramWithSource(context, 1, (const char **) &KernelSource, NULL, &errCPU);
    assert(errCPU == CL_SUCCESS);

    if (!program) {
        printf("Error: Failed to create compute program!\n");
        exit(1);
    }

    // Build the program executable
    //
    errCPU = clBuildProgram(program, 0, NULL, "-cl-opt-disable -cl-std=CL2.0 -g -Werror", NULL, NULL);

    if (errCPU != CL_SUCCESS) {
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    // Create the compute kernel in the program we wish to run
    //
    kernel = clCreateKernel(program, "square", &errCPU);
    assert(errCPU == CL_SUCCESS);

    if (!kernel || errCPU != CL_SUCCESS) {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }

// Set the arguments to our compute kernel
//
    errCPU = 0;
    errCPU |= clSetKernelArgSVMPointer(kernel, 0, data);
    errCPU |= clSetKernelArg(kernel, 1, sizeof(data_size), &data_size);
    errCPU |= clSetKernelArgSVMPointer(kernel, 2, errGPU);
    if (errCPU != CL_SUCCESS) {
        printf("Error: Failed to set kernel arguments! %d\n", errCPU);
        exit(1);
    }

// Get the maximum work group size for executing the kernel on the device
//
    errCPU = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (errCPU != CL_SUCCESS) {
        printf("Error: Failed to retrieve kernel work group info! %d\n", errCPU);
        exit(1);
    }
    cout << "CL_KERNEL_WORK_GROUP_SIZE: " << local << endl;

// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
    global = 8;
    local = 8;
    errCPU = clEnqueueNDRangeKernel(queue_host, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (errCPU) {
        printf("Error: Failed to execute kernel!\n");
        exit(1);
    }

// Wait for the command commands to get serviced before reading back results
//
    clFinish(queue_host);

// Validate our results
//
// TODO
    print((char*) data, data_size);

// Shutdown and cleanup
//
    clSVMFree(context, data);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue_host);
    clReleaseContext(context);
    getchar();
    return 0;
}

 

has anyone an idea?

 

Thanks for your help.