Accéder au contenu.
Menu Sympa

starpu-devel - [Starpu-devel] data access

Objet : Developers list for StarPU

Archives de la liste

[Starpu-devel] data access


Chronologique Discussions 
  • From: Новак Александр <sasha-novak@yandex.ru>
  • To: "starpu-devel@lists.gforge.inria.fr" <starpu-devel@lists.gforge.inria.fr>
  • Subject: [Starpu-devel] data access
  • Date: Sat, 11 Nov 2017 14:43:51 +0300
  • Authentication-results: mail2-smtp-roc.national.inria.fr; spf=None smtp.pra=nathalie.furmento@labri.fr; spf=None smtp.mailfrom=nathalie.furmento@labri.fr; spf=None smtp.helo=postmaster@v-zimmta03.u-bordeaux.fr
  • Ironport-phdr: 9a23:FIp0kxVTFgPIKsWSPERcS+TETbrV8LGtZVwlr6E/grcLSJyIuqrYZhWBt8tkgFKBZ4jH8fUM07OQ6P+wHzFYqb+681k8M7V0HycfjssXmwFySOWkMmbcaMDQUiohAc5ZX0Vk9XzoeWJcGcL5ekGA6ibqtW1aSV3DMl8/Pfj8AJbPysi6ye2205nSeBlTwjWzZql9IVO3qx/Qv48Ym8Eqfr0twwHRvz5EdvpbwUtsJEmPhFDz6MCq85Ml8iJKuvtn+dQWAovgeKFtbbtyBd1uGok4DcbitfDbBV+U72AcWyAQlQRBACDU5Rf2U4e3qDas5bk14zWTIcCjFeN8Yj+l9ao+EBI=
  • Ironport-phdr: 9a23:ZOyYLxaFsSQ5SaFQgdztxY//LSx+4OfEezUN459isYplN5qZpc28bnLW6fgltlLVR4KTs6sC0LWG9f24EUU7or+/81k6OKRWUBEEjchE1ycBO+WiTXPBEfjxciYhF95DXlI2t1uyMExSBdqsLwaK+i76vnYuHUC1LhZ8PPzoX4Lfkcmz/+Sz4IHIJQpGgyCybPVzKg+3pEPfrINe1ZB+I7wp117Fr2VFf8xSxHh0PhSckRHm6cr2/Zh58i0Wteh3pOBaVqCvX6MSRFUQJd0r2ms85CTz/U3YRBCP7z4aVXsfkTJZCg7J4QG8RIqn4XiyjfZ0xCTPZZ6+drszQzn3t6o=
  • List-archive: <http://lists.gforge.inria.fr/pipermail/starpu-devel/>
  • List-id: "Developers list. For discussion of new features, code changes, etc." <starpu-devel.lists.gforge.inria.fr>
  • Resent-date: Mon, 13 Nov 2017 10:20:48 +0100
  • Resent-from: Nathalie Furmento <nathalie.furmento@labri.fr>
  • Resent-message-id: <20171113092048.4m2brk5ka7xtvfxb@esterel>
  • Resent-to: starpu-devel@lists.gforge.inria.fr

Hello!
I'm using StarPU in my project, but I have some problems with data management.
 
How can I tell StarPU that I take responsibility for data independence?
For example, I have a classical matrix multiplication, C = A*B.
I want to compute the rows of matrix C in parallel. In the example you partition matrix C and give each task its own sub-data, but I want to send the full matrices A, B and C to each task,
and calculate the necessary offset within each task (see the attachment). Currently StarPU thinks that all tasks depend on each other and runs them sequentially.
 
In general, I want each task to receive the whole memory block and decide for itself where it needs to write. Is this possible?
 
Please could you help me?
 
-- 
Best regards,
Alexander Novak
 
#include <math.h>
#include <vector>
#include <set>

#include <list>
#include <stdio.h>
#include <stdlib.h>


#include <starpu.h>
#include <starpu_fxt.h>


void multStarPU(void *buffers[], void *cl_arg)
{

	struct starpu_vector_interface *vec_A = (struct starpu_vector_interface *) buffers[0];
	struct starpu_vector_interface *vec_B = (struct starpu_vector_interface *) buffers[1];
	struct starpu_vector_interface *vec_C = (struct starpu_vector_interface *) buffers[2];

	double *A = (double*)STARPU_VECTOR_GET_PTR(vec_A);
	double *B = (double*)STARPU_VECTOR_GET_PTR(vec_B);
	double *C = (double*)STARPU_VECTOR_GET_PTR(vec_C);

	int *args = (int *) cl_arg;
	int M = args[0];
	int N = args[1];
	int L = args[2];
	int i = args[3];

	for(int j = 0; j < N; j++) {
		C[j] = 0;
		for(int k = 0; k < L; k++) {
			C[j] += (A[i * L + k] * B[k * N + j]);
		}
	}

}


void multStarPU_fullC(void *buffers[], void *cl_arg)
{

	struct starpu_vector_interface *vec_A = (struct starpu_vector_interface *) buffers[0];
	struct starpu_vector_interface *vec_B = (struct starpu_vector_interface *) buffers[1];
	struct starpu_vector_interface *vec_C = (struct starpu_vector_interface *) buffers[2];

	double *A = (double*)STARPU_VECTOR_GET_PTR(vec_A);
	double *B = (double*)STARPU_VECTOR_GET_PTR(vec_B);
	double *C = (double*)STARPU_VECTOR_GET_PTR(vec_C);

	int *args = (int *) cl_arg;
	int M = args[0];
	int N = args[1];
	int L = args[2];
	int i = args[3];

	for(int j = 0; j < N; j++) {
		C[i* N + j] = 0;
		for(int k = 0; k < L; k++) {
			C[i* N + j] += (A[i * L + k] * B[k * N + j]);
		}
	}

}

static struct starpu_perfmodel matrixmult_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "matrixmult"
};
static struct starpu_codelet cl_matrixmult =
{
	/* CPU implementation of the codelet */
	.cpu_funcs =
	{
			multStarPU
	},
	.cpu_funcs_name =
	{
		"matrixMult"
	},

	.nbuffers = 3,
	.modes = {STARPU_R, STARPU_R, STARPU_RW},
	.model = &matrixmult_model,
};



int main()
{
	int M = 2000;
	int N = 2000;
	int L = 2000;
	double* A = (double *) malloc(M*L*sizeof(double)); 
	double* B = (double *) malloc(N*L*sizeof(double)); 
	double* C = (double *) malloc(M*N*sizeof(double)); 

	double* C_ref = new double [M * N];

	for(int i = 0; i < M*N;i++) {
		A[i] = i%100;
		B[i] = i%100;
	}
	int ret = starpu_init(NULL);


	//This code doesn't work parallel because all task get full matrix C.
	//Kernel for this function is multStarPU_fullC

	// starpu_data_handle_t vector_A, vector_B, vector_C;
	// starpu_vector_data_register(&vector_A, STARPU_MAIN_RAM, (uintptr_t)A, M * L, sizeof(double));
	// starpu_vector_data_register(&vector_B, STARPU_MAIN_RAM, (uintptr_t)B, L * N, sizeof(double));
	// starpu_vector_data_register(&vector_C, STARPU_MAIN_RAM, (uintptr_t)C, M * N, sizeof(double));


	// for(int i = 0; i < M; i++) {
	// 	struct starpu_task *task_mult = starpu_task_create();

	// 	task_mult->cl = &cl_matrixmult;
	// 	int *mult_params = new int[4];
	// 	mult_params[0] = M;
	// 	mult_params[1] = N;
	// 	mult_params[2] = L;
	// 	mult_params[3] = i;
	// 	task_mult->handles[0] = vector_A;
	// 	task_mult->handles[1] = vector_B;
	// 	task_mult->handles[2] = vector_C;
	// 	task_mult->cl_arg = mult_params;
	// 	task_mult->cl_arg_size = sizeof(int);
	// 	ret = starpu_task_submit(task_mult);
	// }



	starpu_data_handle_t vector_A, vector_B;
	starpu_vector_data_register(&vector_A, STARPU_MAIN_RAM, (uintptr_t)A, M * L, sizeof(double));
	starpu_vector_data_register(&vector_B, STARPU_MAIN_RAM, (uintptr_t)B, L * N, sizeof(double));


	double start;
	double end;


	start = starpu_timing_now();


	//This code work parallel, because i split matrix C
	for(int i = 0; i < M; i++) {
		starpu_data_handle_t  vector_C;
		starpu_vector_data_register(&vector_C, STARPU_MAIN_RAM, (uintptr_t)(C + i * N), N, sizeof(double));
		struct starpu_task *task_mult = starpu_task_create();

		task_mult->cl = &cl_matrixmult;
		int *mult_params = new int[4];
		mult_params[0] = M;
		mult_params[1] = N;
		mult_params[2] = L;
		mult_params[3] = i;
		task_mult->handles[0] = vector_A;
		task_mult->handles[1] = vector_B;
		task_mult->handles[2] = vector_C;
		task_mult->cl_arg = mult_params;
		task_mult->cl_arg_size = sizeof(int);
		ret = starpu_task_submit(task_mult);
	}

	starpu_task_wait_for_all();

	end =  starpu_timing_now();

	double timing = end - start;
	printf("Total timing : %2.2lf ms\n", timing/1000);

	double start_ref;
	double end_ref;
	start_ref = starpu_timing_now();
	for(int i = 0; i < M; i++) {
		for(int j = 0; j < N; j++) {
			C_ref[i * N + j] = 0;
			for(int k = 0; k < L; k++) {
				C_ref[i * N + j] += (A[i * L + k] * B[k * N + j]);
			}
		}
	}

	end_ref =  starpu_timing_now();

	double timing_ref = end_ref - start_ref;
	printf("Total timing ref: %2.2lf ms\n", timing_ref/1000);

	double max_dif = -1000;
	for(int i = 0; i < M * N; i++) {
		if(fabs(C_ref[i] - C[i]) > max_dif) {
			max_dif = fabs(C_ref[i] - C[i]);
		}
	}

	printf("MAX_DIF = %lf \n", max_dif);
	return 0;
}



Archives gérées par MHonArc 2.6.19+.

Haut de la page