Objet : Developers list for StarPU
Archives de la liste
- From: Samuel Thibault <samuel.thibault@ens-lyon.org>
- To: Usman Dastgeer <usman.dastgeer@liu.se>
- Cc: "starpu-devel@lists.gforge.inria.fr" <starpu-devel@lists.gforge.inria.fr>
- Subject: Re: [Starpu-devel] OpenCL problem
- Date: Tue, 20 Dec 2011 02:45:47 +0100
- List-archive: <http://lists.gforge.inria.fr/pipermail/starpu-devel>
- List-id: "Developers list. For discussion of new features, code changes, etc." <starpu-devel.lists.gforge.inria.fr>
Hello,
Usman Dastgeer, le Fri 29 Jul 2011 15:32:44 +0200, a écrit :
> I have this problem with OpenCL. For illustration, I have made a sample
> reduction application. This application works fine when executed on a CPU
> worker but not on an OpenCL worker? It gives the wrong result for the second
> output when doing reduction with partitioning on an OpenCL worker.
At last I got some time to have a look. The only safe solution is to
expose cl_mem + offset to codelets (which is actually the reason for
dev_handle and offset in the interface). I have added the interface to
the trunk, attached is a version of your program which works properly,
using both GET_DEV_HANDLE and GET_OFFSET.
Samuel
#include <iostream>
#include <starpu.h>
#include <starpu_opencl.h>
#include <starpu_deprecated_api.h>
#include <CL/cl.h>
//#include <OpenCL/opencl.h>
static std::string ReduceKernel_CL(
"__kernel void ReduceKernel(__global int* input, unsigned input_offset,
__global int* output, unsigned output_offset, unsigned int n, __local int*
sdata)\n"
"{\n"
" unsigned int blockSize = get_local_size(0);\n"
" unsigned int tid = get_local_id(0);\n"
" unsigned int i = get_group_id(0)*blockSize + get_local_id(0);\n"
" unsigned int gridSize = blockSize*get_num_groups(0);\n"
" int result = 0;\n"
" input = (__global void*) input + input_offset;\n"
" output = (__global void*) output + output_offset;\n"
" if(i < n)\n"
" {\n"
" result = input[i];\n"
" i += gridSize;\n"
" }\n"
" while(i < n)\n"
" {\n"
" result = result + input[i];\n"
" i += gridSize;\n"
" }\n"
" sdata[tid] = result;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(blockSize >= 512) { if (tid < 256 && tid + 256 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 256) { if (tid < 128 && tid + 128 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 128) { if (tid < 64 && tid + 64 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 64) { if (tid < 32 && tid + 32 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 32) { if (tid < 16 && tid + 16 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 16) { if (tid < 8 && tid + 8 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 8]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 8) { if (tid < 4 && tid + 4 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 4]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 4) { if (tid < 2 && tid + 2 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 2]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(blockSize >= 2) { if (tid < 1 && tid + 1 < n) { sdata[tid] =
sdata[tid] + sdata[tid + 1]; } barrier(CLK_LOCAL_MEM_FENCE); }\n"
" if(tid == 0)\n"
" {\n"
" output[get_group_id(0)] = sdata[tid];\n"
" }\n"
"}\n"
);
struct starpu_opencl_program opencl_codelet;
template<typename T>
int min(T a, T b)
{
return (a<b)? a:b;
}
template <typename T>
void opencl_func(void *buffers[], void *_args)
{
unsigned int size;
starpu_vector_interface_t *vector = (starpu_vector_interface_t
*)buffers[0];
cl_mem input = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(vector);
unsigned input_offset = STARPU_VECTOR_GET_OFFSET(vector);
size = STARPU_VECTOR_GET_NX((starpu_vector_interface_t *)buffers[0]);
cl_mem total;
total = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
unsigned total_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);
cl_int err;
// Setup parameters
unsigned int n = size;
size_t numBlocks;
size_t numThreads;
size_t globalWorkSize[1];
size_t localWorkSize[1];
numThreads = 512;
numBlocks = std::max((int)(size_t) 1, std::min( (int)(n / numThreads),
512));
// Decide size of shared memory
size_t sharedMemSize = sizeof(T) * numThreads;
int id, devid;
cl_kernel kernel;
cl_command_queue queue;
id = starpu_worker_get_id();
devid = starpu_worker_get_devid(id);
cl_context context;
starpu_opencl_get_context(devid, &context);
cl_mem tempTotal;
size_t sizeVec= numBlocks * sizeof(T);
tempTotal = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeVec, NULL,
&err);
if(err != CL_SUCCESS){std::cerr<<"Error allocating memory on OpenCL
device\n";}
unsigned zero;
err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_codelet,
"ReduceKernel", devid);
if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
// Sets the kernel arguments for first reduction
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&input);
clSetKernelArg(kernel, 1, sizeof(unsigned), (void*)&input_offset);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&tempTotal);
clSetKernelArg(kernel, 3, sizeof(unsigned), (void*)&total_offset);
clSetKernelArg(kernel, 4, sizeof(unsigned int), (void*)&n);
clSetKernelArg(kernel, 5, sharedMemSize, NULL);
globalWorkSize[0] = numBlocks * numThreads;
localWorkSize[0] = numThreads;
// First reduce all elements blockwise so that each block produces one
element.
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
if(err != CL_SUCCESS){std::cerr<<"Error launching kernel !! 1st
"<<err<<"\n";}
// Sets the kernel arguments for second reduction
n = numBlocks;
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tempTotal);
clSetKernelArg(kernel, 1, sizeof(unsigned), (void*)&zero);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&total);
clSetKernelArg(kernel, 3, sizeof(unsigned), (void*)&zero);
clSetKernelArg(kernel, 4, sizeof(unsigned int), (void*)&n);
clSetKernelArg(kernel, 5, sharedMemSize, NULL);
globalWorkSize[0] = 1 * numThreads;
localWorkSize[0] = numThreads;
// Reduces the elements from the previous reduction in a single block to
produce the scalar result.
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
if(err != CL_SUCCESS){std::cerr<<"Error launching kernel!! 2nd\n";}
clFinish(queue);
}
int reduce_cpu(int a, int b)
{
return a+b;
}
template <typename T>
void cpu_func(void *buffers[], void *arg)
{
unsigned int size;
starpu_vector_interface_t *vector = (starpu_vector_interface_t
*)buffers[0];
T* input = (T *)STARPU_VECTOR_GET_PTR(vector);
size = STARPU_VECTOR_GET_NX((starpu_vector_interface_t *)buffers[0]);
T *total;
total = (T *)STARPU_VECTOR_GET_PTR(buffers[1]);
*total = input[0];
for(int ind=1; ind<size; ind++)
{
*total = reduce_cpu(*total, input[ind]);
}
}
//#define ROWS 10
//#define COLS 10
#define SIZE 100
#define PARTS 10
#define EXP_RESULT 200
starpu_codelet codelet;
int main()
{
int *arr= new int[SIZE];
int output=0;
for(int i=0; i<SIZE;i++)
{
arr[i]=2;
}
starpu_init(NULL);
starpu_opencl_load_opencl_from_string(&ReduceKernel_CL[0],
&opencl_codelet, "");
starpu_data_handle vector_handle;
starpu_vector_data_register(&vector_handle, 0, (uintptr_t)arr, SIZE,
sizeof(arr[0]));
codelet.where = STARPU_CPU | STARPU_OPENCL;
codelet.opencl_func = opencl_func<int>;
codelet.cpu_func = cpu_func<int>;
codelet.nbuffers = 2;
/*
* I have the problem when executing on OpenCL workers. If I comment
out this "first reduction without partitioning", then
* the "second reduction with partitioning" will work on OpenCL.
Without commenting out this one, the second reduction with partitioning
* works with CPUs and CUDA but not with OpenCL.
*/
std::cerr<<"*** Doing reduction without partitioning ***\n";
struct starpu_task *task = starpu_task_create();
task->cl = &codelet;
task->buffers[0].handle = vector_handle;
task->buffers[0].mode = STARPU_R;
// Used here for this execution only
starpu_data_handle output_handle;
starpu_vector_data_register(&output_handle, 0, (uintptr_t)&output, 1,
sizeof(output));
task->buffers[1].handle = output_handle;
task->buffers[1].mode = STARPU_W;
task->synchronous = 1;
int ret= starpu_task_submit(task);
if (ret == -ENODEV)
{
fprintf(stderr, "ERROR: No worker may execute this task\n");
return -1;
}
// don't use it anymore, jus display the output
starpu_data_unregister(output_handle);
std::cout<<"output: "<<output<<"\n";
std::cerr<<"*** Doing reduction with partitioning ***\n";
/* partitioning the input vector */
struct starpu_data_filter vector_filter;
memset(&vector_filter, 0, sizeof(vector_filter));
vector_filter.filter_func = starpu_block_filter_func_vector;
vector_filter.nchildren = PARTS;
starpu_data_partition(vector_handle, &vector_filter);
/* creating temporary vector for storing intermediate results */
starpu_data_handle result_handle;
int subResult[PARTS];
starpu_vector_data_register(&result_handle, 0, (uintptr_t)subResult,
PARTS, sizeof(subResult[0]));
/* partitioning the temporary vector */
struct starpu_data_filter result_filter;
memset(&result_filter, 0, sizeof(result_filter));
result_filter.filter_func = starpu_block_filter_func_vector;
result_filter.nchildren = PARTS;
starpu_data_partition(result_handle, &result_filter);
// creating the tasks
for(int i=0;i<PARTS;i++)
{
struct starpu_task *task = starpu_task_create();
task->cl = &codelet;
task->buffers[0].handle =
starpu_data_get_sub_data(vector_handle, 1, i);
task->buffers[0].mode = STARPU_R;
task->buffers[1].handle =
starpu_data_get_sub_data(result_handle, 1, i);
task->buffers[1].mode = STARPU_W;
int ret= starpu_task_submit(task);
if (ret == -ENODEV)
{
fprintf(stderr, "ERROR: No worker may execute this task\n");
return -1;
}
}
// waiting for all tasks to finish
starpu_task_wait_for_all();
// unpartition the vectors first
starpu_data_unpartition(vector_handle, 0);
starpu_data_unpartition(result_handle, 0);
// then unregister the vectors
starpu_data_unregister(vector_handle);
starpu_data_unregister(result_handle);
// accumulate intermediate results
output=subResult[0];
for(int i=1;i<PARTS;i++)
{
output += subResult[i];
}
// display results with partitioning
std::cerr<<"output: "<<output<<"\n";
starpu_opencl_unload_opencl(&opencl_codelet);
starpu_shutdown();
}
- Re: [Starpu-devel] OpenCL problem, Samuel Thibault, 20/12/2011
Archives gérées par MHonArc 2.6.19+.