Accéder au contenu.
Menu Sympa

starpu-devel - Re: [Starpu-devel] StarPU CUDA out of memory

Objet : Developers list for StarPU

Archives de la liste

Re: [Starpu-devel] StarPU CUDA out of memory


Chronologique Discussions 
  • From: Usman Dastgeer <usman.dastgeer@liu.se>
  • To: Samuel Thibault <samuel.thibault@ens-lyon.org>, <starpu-devel@lists.gforge.inria.fr>
  • Subject: Re: [Starpu-devel] StarPU CUDA out of memory
  • Date: Mon, 18 Feb 2013 10:22:07 +0100
  • List-archive: <http://lists.gforge.inria.fr/pipermail/starpu-devel>
  • List-id: "Developers list. For discussion of new features, code changes, etc." <starpu-devel.lists.gforge.inria.fr>

Thanks Samuel for your reply.

I am using both the vector and the matrix interfaces. Here is a simple program that reproduces the error.

regards,
Usman.

On 02/18/2013 09:54 AM, Samuel Thibault wrote:
Usman Dastgeer, le Mon 18 Feb 2013 09:44:02 +0100, a écrit :
I am getting "out of memory" error with CUDA fermi GPU when using StarPU.
How are you getting it exactly? Which data interface are you using?

With StarPU it seems that CUDA memory is not actually freed on
"data_unregister"
Indeed, StarPU uses an allocation cache. But it does memory reclaiming
when it gets an out of memory error, so it's not supposed to happen.

Samuel

Attachment: cuda_funcs.cu
Description: application/cu-seeme

#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdint.h>
#include <starpu.h>


/*!
 * Fill an atom buffer with pseudo-random positions and charges.
 *
 * The buffer is treated as consecutive records of four doubles:
 * x, y, z and charge. The generator is reseeded with srand(0) on every
 * call, so repeated calls produce the same values.
 *
 * Only complete records are written: when \p count is not a multiple of 4
 * the trailing (count % 4) doubles are left untouched instead of being
 * overrun.
 *
 * \param atombuf      output buffer holding at least \p count doubles
 * \param count        number of doubles available in \p atombuf
 * \param volsize      grid dimensions, in grid points
 * \param gridspacing  spacing between two grid points
 * \return always 0
 */

int initatoms(double *atombuf,int count,dim3 volsize,double gridspacing)
{
  srand(0);
  dim3 size;
// compute grid dimensions in angstroms
// (dim3 members are unsigned integers, so the products are truncated)
  size.x = (gridspacing * volsize.x);
  size.y = (gridspacing * volsize.y);
  size.z = (gridspacing * volsize.z);
// "count - i >= 4" guarantees that addr + 3 is still inside the buffer and
// cannot overflow, unlike a plain "i < count" test.
  for (int i = 0; count - i >= 4; i += 4) {
    int addr = i;
    // NOTE(review): rand() is divided by 100, not RAND_MAX, so the positions
    // are not confined to the volume -- confirm whether that is intended.
    atombuf[addr] = (((rand()) / ((double )100)) * size.x);
    atombuf[addr + 1] = (((rand()) / ((double )100)) * size.y);
    atombuf[addr + 2] = (((rand()) / ((double )100)) * size.z);
// charge
    atombuf[addr + 3] = ((((rand()) / ((double )100)) * 2.0) - 1.0);
  }
  return 0;
}






/*
 * Groups a StarPU codelet with the performance model attached to it and a
 * flag recording whether the codelet fields have been filled in yet.
 */
struct cl_struct {
    struct starpu_codelet   cl;          // codelet descriptor handed to tasks
    struct starpu_perfmodel pm_history;  // performance model referenced by cl.model
    int                     cl_init;     // 0 until the codelet has been set up, then 1
};


// Direct (non-StarPU) version: allocates and releases the memory it needs
// internally. Defined in another translation unit; main() keeps a
// commented-out call to it that passes the host arrays directly.
extern void cuda_func_direct(double *atominfo, double *grid_in, double *energy_out, int atomsSize, int gridWidth, int gridHeight, float gridspacing);

// StarPU version, invoked by cuda_func_wrapper() with the buffer pointers
// obtained from the StarPU vector/matrix interfaces. Defined in another
// translation unit.
// NOTE(review): presumably these are device pointers managed by StarPU -- confirm.
extern void cuda_func(double* atominfo, double* grid_in, double* energy_out, int atomsSize, int gridWidth, int gridHeight, float gridspacing);

/*
 * CUDA implementation registered in the codelet: unpacks the three StarPU
 * buffers and the codelet argument, then forwards everything to cuda_func().
 *
 * buffers[0] is read through the vector interface (atom data),
 * buffers[1] and buffers[2] through the matrix interface (input grid and
 * output energies). args points to a float holding the grid spacing.
 */
void cuda_func_wrapper(void *buffers[], void *args)
{
    void *atomsIface  = buffers[0];
    void *gridIface   = buffers[1];
    void *energyIface = buffers[2];

    double *atomPtr   = (double*)STARPU_VECTOR_GET_PTR(atomsIface);
    double *gridPtr   = (double*)STARPU_MATRIX_GET_PTR(gridIface);
    double *energyPtr = (double*)STARPU_MATRIX_GET_PTR(energyIface);

    // The grid dimensions are taken from the input grid (buffers[1]).
    int nAtomValues = STARPU_VECTOR_GET_NX(atomsIface);
    int width       = STARPU_MATRIX_GET_NX(gridIface);
    int height      = STARPU_MATRIX_GET_NY(gridIface);

    float spacing = *((float *)args);

    cuda_func(atomPtr, gridPtr, energyPtr, nAtomValues, width, height, spacing);
}

void func_wrapper(double* atominfo, double* grid_in, double* energy_out, int atomsSize, int gridWidth, int gridHeight, float gridspacing)
{
    static cl_struct *objcl_struct;
    if( objcl_struct == NULL )
    {
        objcl_struct = (cl_struct*) malloc( sizeof( cl_struct ));
        memset( &(objcl_struct->cl), 0, sizeof(objcl_struct->cl));
        memset( &(objcl_struct->pm_history), 0, sizeof(objcl_struct->pm_history));
        objcl_struct->cl_init = 0;
    }
    if ( !(objcl_struct->cl_init) ) // codelete initialization only once, at first invocation
    {
        objcl_struct->cl.where = 0 | STARPU_CUDA ;
        objcl_struct->cl.cuda_funcs[0] = cuda_func_wrapper;
        objcl_struct->cl.cuda_funcs[1] = NULL;
	
        objcl_struct->cl.nbuffers = 3;
	
        objcl_struct->cl.modes[0] = STARPU_R;
        objcl_struct->cl.modes[1] = STARPU_R;
        objcl_struct->cl.modes[2] = STARPU_W;

	objcl_struct->pm_history.symbol = "pm_history";
        objcl_struct->pm_history.type = STARPU_HISTORY_BASED;
        objcl_struct->cl.model = &( objcl_struct->pm_history );
        objcl_struct->cl_init = 1;
    };    
    
    starpu_data_handle_t dh_atominfo;
    starpu_data_handle_t dh_grid_in;
    starpu_data_handle_t dh_energy_out;
    
    starpu_vector_data_register( &dh_atominfo,   0, ((unsigned long )atominfo),   atomsSize, sizeof(double) );
    
    starpu_matrix_data_register( &dh_grid_in,    0, ((unsigned long )grid_in),    gridWidth, gridWidth,     gridHeight,sizeof(double ) );
    starpu_matrix_data_register( &dh_energy_out, 0, ((unsigned long )energy_out), gridWidth, gridWidth,     gridHeight,sizeof(double ) );


    struct starpu_task *task = starpu_task_create();
    
    task->synchronous = 1;
    task->cl = &(objcl_struct->cl);
    task->handles[0] = dh_atominfo;
    task->handles[1] = dh_grid_in;
    task->handles[2] = dh_energy_out;
    
    task->cl_arg = &gridspacing;
    task->cl_arg_size = sizeof( float );
    
    int _ret_ = starpu_task_submit(task);
    if(_ret_ == -ENODEV)
    {
	fprintf(stderr, "ERROR: No worker may execute this task\n");
	return;
    }    
    
    
    starpu_data_unregister(dh_atominfo);
    starpu_data_unregister(dh_grid_in);
    starpu_data_unregister(dh_energy_out);
}




#define NB_ITERATIONS 15

/*
 * Reproducer: repeatedly runs the StarPU-wrapped computation on two
 * matrixSize x matrixSize grids of doubles.
 *
 * Returns 0 on success and -1 when StarPU cannot be initialized or the atom
 * buffer cannot be set up. The host buffers are released and StarPU is shut
 * down on every path taken after a successful starpu_init().
 */
int main(int argc,char **argv)
{
    (void)argc;
    (void)argv;

    const int matrixSize = 8192;
    const int atomcount = 100;

    const int initRet = starpu_init(NULL);
    if (initRet != 0)
    {
        fprintf(stderr, "ERROR: starpu_init failed with code %d\n", initRet);
        return -1;
    }

    // Four doubles per atom: x, y, z, charge.
    const int atomsSize = (atomcount * 4);
    // Computed in size_t so the element count cannot overflow int.
    const size_t gridElems = (size_t)matrixSize * (size_t)matrixSize;

    double *atoms = new double [atomsSize];
    // grid_in is read by the task, so it must not hold indeterminate values.
    double *grid_in = new double [gridElems]();
    double *energy_out = new double [gridElems];

    dim3 volsize;
    volsize.x = 2048;
    volsize.y = 2048;
    volsize.z = 1;
    double gridspacing = 0.1;

    int status = 0;

    // allocate and initialize atom coordinates and charges
    if ((initatoms(atoms,atomsSize,volsize,gridspacing)))
    {
        status = -1;
    }
    else
    {
        for(int i=0; i<NB_ITERATIONS; ++i)
        {
            // uses starpu, gives memory error....
            func_wrapper(atoms,grid_in,energy_out,atomsSize,matrixSize,matrixSize,gridspacing);

            // the following direct call works...
//          cuda_func_direct(atoms,grid_in,energy_out,atomsSize,matrixSize,matrixSize,gridspacing);
        }
    }

    delete []atoms;
    delete []grid_in;
    delete []energy_out;

    starpu_shutdown();

    return status;
}


# Build the StarPU reproducer: main.cpp (host code) + cuda_funcs.cu (CUDA code).
CFLAGS += $(shell pkg-config --cflags libstarpu) -O3
LDFLAGS += $(shell pkg-config --libs libstarpu)
NVCCFLAGS = -arch sm_20

# "all" and "clean" do not name files.
.PHONY: all clean

all:main

# Every recipe line below starts with a TAB, as make requires.
main : main.o cuda_funcs.o
	g++ main.o cuda_funcs.o $(CFLAGS) $(LDFLAGS) -fopenmp -o main

main.o : main.cpp
	g++ main.cpp $(CFLAGS) -c -o main.o

cuda_funcs.o : cuda_funcs.cu
	nvcc cuda_funcs.cu $(CFLAGS) -c -o cuda_funcs.o $(NVCCFLAGS)

clean:
	rm -f main *.o *~



Archives gérées par MHonArc 2.6.19+.

Haut de la page