starpu-devel - Re: [Starpu-devel] OpenCL problem

Objet : Developers list for StarPU

Archives de la liste

Re: [Starpu-devel] OpenCL problem

From: Samuel Thibault <samuel.thibault@ens-lyon.org>
To: Usman Dastgeer <usman.dastgeer@liu.se>
Cc: "starpu-devel@lists.gforge.inria.fr" <starpu-devel@lists.gforge.inria.fr>
Subject: Re: [Starpu-devel] OpenCL problem
Date: Tue, 20 Dec 2011 02:45:47 +0100
List-archive: <http://lists.gforge.inria.fr/pipermail/starpu-devel>
List-id: "Developers list. For discussion of new features, code changes, etc." <starpu-devel.lists.gforge.inria.fr>

Hello,

Usman Dastgeer, le Fri 29 Jul 2011 15:32:44 +0200, a écrit :
> I have this problem with OpenCL. For illustration, I have made a sample
> reduction application. This application works fine when executed on a CPU
> worker but not on an OpenCL worker? It gives the wrong result for the second
> output when doing reduction with partitioning on an OpenCL worker.

At last I got some time to have a look. The only safe solution is to
expose cl_mem + offset to codelets (which is actually the reason for
dev_handle and offset in the interface). I have added the interface to
the trunk, attached is a version of your program which works properly,
using both GET_DEV_HANDLE and GET_OFFSET.

Samuel
#include <iostream>
#include <starpu.h>
#include <starpu_opencl.h>
#include <starpu_deprecated_api.h>
#include <CL/cl.h>
//#include <OpenCL/opencl.h>

static std::string ReduceKernel_CL(
"__kernel void ReduceKernel(__global int* input, unsigned input_offset,
__global int* output, unsigned output_offset, unsigned int n, __local int*
sdata)\n"
"{\n"
" unsigned int blockSize = get_local_size(0);\n"
" unsigned int tid = get_local_id(0);\n"
" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
" unsigned int gridSize = blockSize*get_num_groups(0);\n"
" int result = 0;\n"
" input = (__global void*) input + input_offset;\n"
" output = (__global void*) output + output_offset;\n"
" if(i < n)\n"
" {\n"
" result = input[i];\n"
" i += gridSize;\n"
" }\n"
" while(i < n)\n"
" {\n"
" result = result + input[i];\n"
" i += gridSize;\n"
" }\n"
" sdata[tid] = result;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 8]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 4]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 2]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 1]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(tid == 0)\n"
" {\n"
" output[get_group_id(0)] = sdata[tid];\n"
" }\n"
"}\n"
);

struct starpu_opencl_program opencl_codelet;

template<typename T>
int min(T a, T b)
{
return (a<b)? a:b;
}

template <typename T>
void opencl_func(void *buffers[], void *_args)
{
unsigned int size;

starpu_vector_interface_t *vector = (starpu_vector_interface_t
*)buffers[0];
cl_mem input = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(vector);
unsigned input_offset = STARPU_VECTOR_GET_OFFSET(vector);

size = STARPU_VECTOR_GET_NX((starpu_vector_interface_t *)buffers[0]);

cl_mem total;
total = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
unsigned total_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);

cl_int err;

// Setup parameters
unsigned int n = size;
size_t numBlocks;
size_t numThreads;
size_t globalWorkSize[1];
size_t localWorkSize[1];

numThreads = 512;
numBlocks = std::max((int)(size_t) 1, std::min( (int)(n / numThreads),
512));

// Decide size of shared memory
size_t sharedMemSize = sizeof(T) * numThreads;

int id, devid;
cl_kernel kernel;
cl_command_queue queue;

id = starpu_worker_get_id();
devid = starpu_worker_get_devid(id);

cl_context context;
starpu_opencl_get_context(devid, &context);

cl_mem tempTotal;
size_t sizeVec= numBlocks * sizeof(T);
tempTotal = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeVec, NULL,
&err);
if(err != CL_SUCCESS){std::cerr<<"Error allocating memory on OpenCL
device\n";}
unsigned zero;

err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_codelet,
"ReduceKernel", devid);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

// Sets the kernel arguments for first reduction
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&input);
clSetKernelArg(kernel, 1, sizeof(unsigned), (void*)&input_offset);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&tempTotal);
clSetKernelArg(kernel, 3, sizeof(unsigned), (void*)&total_offset);
clSetKernelArg(kernel, 4, sizeof(unsigned int), (void*)&n);
clSetKernelArg(kernel, 5, sharedMemSize, NULL);

globalWorkSize[0] = numBlocks * numThreads;
localWorkSize[0] = numThreads;

// First reduce all elements blockwise so that each block produces one
element.
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
if(err != CL_SUCCESS){std::cerr<<"Error launching kernel !! 1st
"<<err<<"\n";}

// Sets the kernel arguments for second reduction
n = numBlocks;
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tempTotal);
clSetKernelArg(kernel, 1, sizeof(unsigned), (void*)&zero);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&total);
clSetKernelArg(kernel, 3, sizeof(unsigned), (void*)&zero);
clSetKernelArg(kernel, 4, sizeof(unsigned int), (void*)&n);
clSetKernelArg(kernel, 5, sharedMemSize, NULL);

globalWorkSize[0] = 1 * numThreads;
localWorkSize[0] = numThreads;

// Reduces the elements from the previous reduction in a single block to
produce the scalar result.
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
if(err != CL_SUCCESS){std::cerr<<"Error launching kernel!! 2nd\n";}

clFinish(queue);
}

int reduce_cpu(int a, int b)
{
return a+b;
}

template <typename T>
void cpu_func(void *buffers[], void *arg)
{
unsigned int size;

starpu_vector_interface_t *vector = (starpu_vector_interface_t
*)buffers[0];
T* input = (T *)STARPU_VECTOR_GET_PTR(vector);

size = STARPU_VECTOR_GET_NX((starpu_vector_interface_t *)buffers[0]);

T *total;
total = (T *)STARPU_VECTOR_GET_PTR(buffers[1]);

*total = input[0];

for(int ind=1; ind<size; ind++)
{
*total = reduce_cpu(*total, input[ind]);
}
}

//#define ROWS 10
//#define COLS 10

#define SIZE 100
#define PARTS 10

#define EXP_RESULT 200

starpu_codelet codelet;

int main()
{
int *arr= new int[SIZE];
int output=0;

for(int i=0; i<SIZE;i++)
{
arr[i]=2;
}

starpu_init(NULL);

starpu_opencl_load_opencl_from_string(&ReduceKernel_CL[0],
&opencl_codelet, "");

starpu_data_handle vector_handle;
starpu_vector_data_register(&vector_handle, 0, (uintptr_t)arr, SIZE,
sizeof(arr[0]));

codelet.where = STARPU_CPU | STARPU_OPENCL;
codelet.opencl_func = opencl_func<int>;
codelet.cpu_func = cpu_func<int>;
codelet.nbuffers = 2;

/*
* I have the problem when executing on OpenCL workers. If I comment
out this "first reduction without partitioning", then
* the "second reduction with partitioning" will work on OpenCL.
Without commenting out this one, the second reduction with partitioning
* works with CPUs and CUDA but not with OpenCL.
*/
std::cerr<<"*** Doing reduction without partitioning ***\n";

struct starpu_task *task = starpu_task_create();
task->cl = &codelet;

task->buffers[0].handle = vector_handle;
task->buffers[0].mode = STARPU_R;

// Used here for this execution only
starpu_data_handle output_handle;
starpu_vector_data_register(&output_handle, 0, (uintptr_t)&output, 1,
sizeof(output));

task->buffers[1].handle = output_handle;
task->buffers[1].mode = STARPU_W;

task->synchronous = 1;

int ret= starpu_task_submit(task);
if (ret == -ENODEV)
{
fprintf(stderr, "ERROR: No worker may execute this task\n");
return -1;
}

// don't use it anymore, jus display the output
starpu_data_unregister(output_handle);
std::cout<<"output: "<<output<<"\n";

std::cerr<<"*** Doing reduction with partitioning ***\n";

/* partitioning the input vector */
struct starpu_data_filter vector_filter;
memset(&vector_filter, 0, sizeof(vector_filter));
vector_filter.filter_func = starpu_block_filter_func_vector;
vector_filter.nchildren = PARTS;
starpu_data_partition(vector_handle, &vector_filter);

/* creating temporary vector for storing intermediate results */
starpu_data_handle result_handle;
int subResult[PARTS];
starpu_vector_data_register(&result_handle, 0, (uintptr_t)subResult,
PARTS, sizeof(subResult[0]));

/* partitioning the temporary vector */
struct starpu_data_filter result_filter;
memset(&result_filter, 0, sizeof(result_filter));
result_filter.filter_func = starpu_block_filter_func_vector;
result_filter.nchildren = PARTS;
starpu_data_partition(result_handle, &result_filter);

// creating the tasks
for(int i=0;i<PARTS;i++)
{
struct starpu_task *task = starpu_task_create();
task->cl = &codelet;

task->buffers[0].handle =
starpu_data_get_sub_data(vector_handle, 1, i);
task->buffers[0].mode = STARPU_R;

task->buffers[1].handle =
starpu_data_get_sub_data(result_handle, 1, i);
task->buffers[1].mode = STARPU_W;

int ret= starpu_task_submit(task);
if (ret == -ENODEV)
{
fprintf(stderr, "ERROR: No worker may execute this task\n");
return -1;
}
}

// waiting for all tasks to finish
starpu_task_wait_for_all();

// unpartition the vectors first
starpu_data_unpartition(vector_handle, 0);
starpu_data_unpartition(result_handle, 0);

// then unregister the vectors
starpu_data_unregister(vector_handle);
starpu_data_unregister(result_handle);

// accumulate intermediate results
output=subResult[0];
for(int i=1;i<PARTS;i++)
{
output += subResult[i];
}

// display results with partitioning
std::cerr<<"output: "<<output<<"\n";

starpu_opencl_unload_opencl(&opencl_codelet);

starpu_shutdown();
}

Re: [Starpu-devel] OpenCL problem, Samuel Thibault, 20/12/2011

Archives gérées par MHonArc 2.6.19+.

Archives de la liste

Re: [Starpu-devel] OpenCL problem