Objet : Developers list for StarPU
Archives de la liste
- From: Новак Александр <sasha-novak@yandex.ru>
- To: "starpu-devel@lists.gforge.inria.fr" <starpu-devel@lists.gforge.inria.fr>
- Subject: [Starpu-devel] data access
- Date: Sat, 11 Nov 2017 14:43:51 +0300
- Authentication-results: mail2-smtp-roc.national.inria.fr; spf=None smtp.pra=nathalie.furmento@labri.fr; spf=None smtp.mailfrom=nathalie.furmento@labri.fr; spf=None smtp.helo=postmaster@v-zimmta03.u-bordeaux.fr
- Ironport-phdr: 9a23:FIp0kxVTFgPIKsWSPERcS+TETbrV8LGtZVwlr6E/grcLSJyIuqrYZhWBt8tkgFKBZ4jH8fUM07OQ6P+wHzFYqb+681k8M7V0HycfjssXmwFySOWkMmbcaMDQUiohAc5ZX0Vk9XzoeWJcGcL5ekGA6ibqtW1aSV3DMl8/Pfj8AJbPysi6ye2205nSeBlTwjWzZql9IVO3qx/Qv48Ym8Eqfr0twwHRvz5EdvpbwUtsJEmPhFDz6MCq85Ml8iJKuvtn+dQWAovgeKFtbbtyBd1uGok4DcbitfDbBV+U72AcWyAQlQRBACDU5Rf2U4e3qDas5bk14zWTIcCjFeN8Yj+l9ao+EBI=
- Ironport-phdr: 9a23:ZOyYLxaFsSQ5SaFQgdztxY//LSx+4OfEezUN459isYplN5qZpc28bnLW6fgltlLVR4KTs6sC0LWG9f24EUU7or+/81k6OKRWUBEEjchE1ycBO+WiTXPBEfjxciYhF95DXlI2t1uyMExSBdqsLwaK+i76vnYuHUC1LhZ8PPzoX4Lfkcmz/+Sz4IHIJQpGgyCybPVzKg+3pEPfrINe1ZB+I7wp117Fr2VFf8xSxHh0PhSckRHm6cr2/Zh58i0Wteh3pOBaVqCvX6MSRFUQJd0r2ms85CTz/U3YRBCP7z4aVXsfkTJZCg7J4QG8RIqn4XiyjfZ0xCTPZZ6+drszQzn3t6o=
- List-archive: <http://lists.gforge.inria.fr/pipermail/starpu-devel/>
- List-id: "Developers list. For discussion of new features, code changes, etc." <starpu-devel.lists.gforge.inria.fr>
- Resent-date: Mon, 13 Nov 2017 10:20:48 +0100
- Resent-from: Nathalie Furmento <nathalie.furmento@labri.fr>
- Resent-message-id: <20171113092048.4m2brk5ka7xtvfxb@esterel>
- Resent-to: starpu-devel@lists.gforge.inria.fr
Hello!
I'm using StarPU in my project, but I have some problems with data management.
How can I tell StarPU that I take responsibility for data independence?
For example, I have a classical matrix multiplication, C = A*B.
I want to compute the rows of matrix C in parallel. In the example, you partition matrix C and get a sub-data handle for each task, but I want to send the full matrices A, B and C to each task,
and within each task calculate the necessary offset (see attachment). Currently, StarPU thinks that all tasks depend on each other and runs them sequentially.
In general, I want each task to receive the whole memory block and decide for itself where it needs to write. Is this possible?
Please could you help me?
--
Best regards,
Alexander Novak
#include <math.h> #include <vector> #include <set> #include <list> #include <stdio.h> #include <stdlib.h> #include <starpu.h> #include <starpu_fxt.h> void multStarPU(void *buffers[], void *cl_arg) { struct starpu_vector_interface *vec_A = (struct starpu_vector_interface *) buffers[0]; struct starpu_vector_interface *vec_B = (struct starpu_vector_interface *) buffers[1]; struct starpu_vector_interface *vec_C = (struct starpu_vector_interface *) buffers[2]; double *A = (double*)STARPU_VECTOR_GET_PTR(vec_A); double *B = (double*)STARPU_VECTOR_GET_PTR(vec_B); double *C = (double*)STARPU_VECTOR_GET_PTR(vec_C); int *args = (int *) cl_arg; int M = args[0]; int N = args[1]; int L = args[2]; int i = args[3]; for(int j = 0; j < N; j++) { C[j] = 0; for(int k = 0; k < L; k++) { C[j] += (A[i * L + k] * B[k * N + j]); } } } void multStarPU_fullC(void *buffers[], void *cl_arg) { struct starpu_vector_interface *vec_A = (struct starpu_vector_interface *) buffers[0]; struct starpu_vector_interface *vec_B = (struct starpu_vector_interface *) buffers[1]; struct starpu_vector_interface *vec_C = (struct starpu_vector_interface *) buffers[2]; double *A = (double*)STARPU_VECTOR_GET_PTR(vec_A); double *B = (double*)STARPU_VECTOR_GET_PTR(vec_B); double *C = (double*)STARPU_VECTOR_GET_PTR(vec_C); int *args = (int *) cl_arg; int M = args[0]; int N = args[1]; int L = args[2]; int i = args[3]; for(int j = 0; j < N; j++) { C[i* N + j] = 0; for(int k = 0; k < L; k++) { C[i* N + j] += (A[i * L + k] * B[k * N + j]); } } } static struct starpu_perfmodel matrixmult_model = { .type = STARPU_HISTORY_BASED, .symbol = "matrixmult" }; static struct starpu_codelet cl_matrixmult = { /* CPU implementation of the codelet */ .cpu_funcs = { multStarPU }, .cpu_funcs_name = { "matrixMult" }, .nbuffers = 3, .modes = {STARPU_R, STARPU_R, STARPU_RW}, .model = &matrixmult_model, }; int main() { int M = 2000; int N = 2000; int L = 2000; double* A = (double *) malloc(M*L*sizeof(double)); double* B = (double *) 
malloc(N*L*sizeof(double)); double* C = (double *) malloc(M*N*sizeof(double)); double* C_ref = new double [M * N]; for(int i = 0; i < M*N;i++) { A[i] = i%100; B[i] = i%100; } int ret = starpu_init(NULL); //This code doesn't work parallel because all task get full matrix C. //Kernel for this function is multStarPU_fullC // starpu_data_handle_t vector_A, vector_B, vector_C; // starpu_vector_data_register(&vector_A, STARPU_MAIN_RAM, (uintptr_t)A, M * L, sizeof(double)); // starpu_vector_data_register(&vector_B, STARPU_MAIN_RAM, (uintptr_t)B, L * N, sizeof(double)); // starpu_vector_data_register(&vector_C, STARPU_MAIN_RAM, (uintptr_t)C, M * N, sizeof(double)); // for(int i = 0; i < M; i++) { // struct starpu_task *task_mult = starpu_task_create(); // task_mult->cl = &cl_matrixmult; // int *mult_params = new int[4]; // mult_params[0] = M; // mult_params[1] = N; // mult_params[2] = L; // mult_params[3] = i; // task_mult->handles[0] = vector_A; // task_mult->handles[1] = vector_B; // task_mult->handles[2] = vector_C; // task_mult->cl_arg = mult_params; // task_mult->cl_arg_size = sizeof(int); // ret = starpu_task_submit(task_mult); // } starpu_data_handle_t vector_A, vector_B; starpu_vector_data_register(&vector_A, STARPU_MAIN_RAM, (uintptr_t)A, M * L, sizeof(double)); starpu_vector_data_register(&vector_B, STARPU_MAIN_RAM, (uintptr_t)B, L * N, sizeof(double)); double start; double end; start = starpu_timing_now(); //This code work parallel, because i split matrix C for(int i = 0; i < M; i++) { starpu_data_handle_t vector_C; starpu_vector_data_register(&vector_C, STARPU_MAIN_RAM, (uintptr_t)(C + i * N), N, sizeof(double)); struct starpu_task *task_mult = starpu_task_create(); task_mult->cl = &cl_matrixmult; int *mult_params = new int[4]; mult_params[0] = M; mult_params[1] = N; mult_params[2] = L; mult_params[3] = i; task_mult->handles[0] = vector_A; task_mult->handles[1] = vector_B; task_mult->handles[2] = vector_C; task_mult->cl_arg = mult_params; task_mult->cl_arg_size 
= sizeof(int); ret = starpu_task_submit(task_mult); } starpu_task_wait_for_all(); end = starpu_timing_now(); double timing = end - start; printf("Total timing : %2.2lf ms\n", timing/1000); double start_ref; double end_ref; start_ref = starpu_timing_now(); for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { C_ref[i * N + j] = 0; for(int k = 0; k < L; k++) { C_ref[i * N + j] += (A[i * L + k] * B[k * N + j]); } } } end_ref = starpu_timing_now(); double timing_ref = end_ref - start_ref; printf("Total timing ref: %2.2lf ms\n", timing_ref/1000); double max_dif = -1000; for(int i = 0; i < M * N; i++) { if(fabs(C_ref[i] - C[i]) > max_dif) { max_dif = fabs(C_ref[i] - C[i]); } } printf("MAX_DIF = %lf \n", max_dif); return 0; }
- [Starpu-devel] data access, Новак Александр, 13/11/2017
- Re: [Starpu-devel] data access, Samuel Thibault, 13/11/2017
Archives gérées par MHonArc 2.6.19+.