Accéder au contenu.
Menu Sympa

starpu-devel - Re: [Starpu-devel] Performance issues when specifying const and __attribute__ ((output))

Objet : Developers list for StarPU

Archives de la liste

Re: [Starpu-devel] Performance issues when specifying const and __attribute__ ((output))


Chronologique Discussions 
  • From: Mehdi AMINI <mehdi.amini@silkan.com>
  • To: Samuel Thibault <samuel.thibault@ens-lyon.org>, starpu-devel@lists.gforge.inria.fr
  • Subject: Re: [Starpu-devel] Performance issues when specifying const and __attribute__ ((output))
  • Date: Thu, 05 Jul 2012 17:05:30 +0200
  • List-archive: <http://lists.gforge.inria.fr/pipermail/starpu-devel>
  • List-id: "Developers list. For discussion of new features, code changes, etc." <starpu-devel.lists.gforge.inria.fr>

Hi all,

Le 02/07/2012 19:51, Samuel Thibault a écrit :
Mehdi AMINI, le Mon 02 Jul 2012 14:26:05 +0200, a écrit :
Le 02/07/2012 13:52, Samuel Thibault a écrit :
Mehdi AMINI, le Mon 02 Jul 2012 12:10:19 +0200, a écrit :
$ STARPU_NCUDA=1 STARPU_NCPUS=0 ./bicg_starpu_naive
192.2
$ STARPU_NCUDA=1 STARPU_NCPUS=0 ./bicg_starpu
746.5

Ouch!

Could you dump traces so we have an idea of what is happening?


Are the FXT traces enough? You'll find them here for both versions :

http://mehdi.amini.fr/StarPU/

So it seems it spends almost all its time in PushingOutput, which is not
supposed to happen. Could you perhaps run in gdb, interrupt it during
the run, and use

thread apply all bt

to get an idea of where it is most often? Or alternatively, compile
with gprof, but that might be more involved.

Would it be possible to see your code? It is hard to have an idea
without being able to run it.



The code is enclosed; it runs each version 10 times.

$ STARPU_NOPENCL=0 STARPU_NCUDA=1 STARPU_NCPUS=0 ./bicg_starpu
Run naive...
223.6
Run 'optimized'...
749.1
Run naive...
222.1
Run 'optimized'...
739.8
Run naive...
223.0
Run 'optimized'...
...



Mehdi





/*
 * file for bicg.c
 */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>



/* Timer code (gettimeofday). */
/* Wall-clock timestamps, in seconds, written by timer_start() and timer_stop(). */
static double t_start, t_end;

/*
 * Return the current wall-clock time in seconds, as reported by
 * gettimeofday(), with the microsecond field folded in as a fraction.
 *
 * If gettimeofday() fails, a diagnostic is printed with perror() and the
 * process exits with status 1.
 */
double timer_get_time(void)
{
    struct timeval t;
    if (gettimeofday (&t, NULL) != 0) {
      /* perror() appends ": <error text>\n" on its own, so the prefix must
         not end with a newline or the error text lands on a separate line. */
      perror("Error gettimeofday");
      exit(1);
    }
    return (t.tv_sec + t.tv_usec * 1.0e-6);
}


/* Store the current time (from timer_get_time()) as the interval start. */
void timer_start()
{
  const double now = timer_get_time();

  t_start = now;
}

/* Store the current time (from timer_get_time()) as the interval end. */
void timer_stop()
{
  const double now = timer_get_time();

  t_end = now;
}

/*
 * Print the interval between the stored start and end timestamps to stdout,
 * in milliseconds with one decimal digit, followed by a newline.
 */
void timer_display()
{
  const double elapsed_ms = (t_end - t_start)*1000;

  printf ("%0.1lf\n", elapsed_ms);
}

/* Convenience wrapper: record the end timestamp, then print the elapsed time. */
void timer_stop_display()
{
  timer_stop();      /* capture the end of the interval first ... */
  timer_display();   /* ... then report end - start */
}


/*
 * Annotated ("optimized") variant of the first task: the buffer `s` carries
 * the `output` attribute.
 *   - first line: declares p4a_launcher_main with the `task` attribute;
 *   - second line: declares p4a_launcher_main_accel as the "cuda"
 *     task_implementation of that task (its definition is not in this file;
 *     presumably it lives in the separately attached kernels.cu -- TODO confirm);
 *   - third line: the plain definition, which only prints "CPU stub" to
 *     stderr (the exit(1) is commented out) and ignores its arguments.
 */
void p4a_launcher_main(__attribute__((output)) double s[8000], int ny) __attribute__((task));
void p4a_launcher_main_accel(__attribute__((output)) double s[8000], int ny) __attribute__ ((task_implementation ("cuda", p4a_launcher_main)));
void p4a_launcher_main(__attribute__((output)) double s[8000], int ny) { fprintf(stderr,"CPU stub\n"); /*exit(1);*/ } 

/*
 * Annotated ("optimized") variant of the second task: `A` and `p` are
 * const-qualified and `q` carries the `output` attribute.
 *   - first line: declares p4a_launcher_main_1 with the `task` attribute;
 *   - second line: declares p4a_launcher_main_1_accel as the "cuda"
 *     task_implementation of that task (definition not in this file;
 *     presumably in the attached kernels.cu -- TODO confirm);
 *   - third line: the plain definition, which only prints "CPU stub" to
 *     stderr (the exit(1) is commented out) and ignores its arguments.
 */
void p4a_launcher_main_1(const double *A, const double p[8000], __attribute__((output)) double q[8000], int nx, int ny) __attribute__((task));
void p4a_launcher_main_1_accel(const double *A, const double p[8000], __attribute__((output)) double q[8000], int nx, int ny) __attribute__ ((task_implementation ("cuda", p4a_launcher_main_1)));
void p4a_launcher_main_1(const double *A, const double p[8000], __attribute__((output)) double q[8000], int nx, int ny) { fprintf(stderr,"CPU stub\n"); /*exit(1);*/ } 

/*
 * Annotated ("optimized") variant of the third task: `A` is const-qualified;
 * `s` has neither const nor the `output` attribute. `i`, `ny` and `r_0` are
 * passed by value.
 *   - first line: declares p4a_launcher_main_2 with the `task` attribute;
 *   - second line: declares p4a_launcher_main_2_accel as the "cuda"
 *     task_implementation of that task (definition not in this file;
 *     presumably in the attached kernels.cu -- TODO confirm);
 *   - third line: the plain definition, which only prints "CPU stub" to
 *     stderr (the exit(1) is commented out) and ignores its arguments.
 */
void p4a_launcher_main_2(const double *A, double s[8000], int i, int ny, double r_0) __attribute__((task));
void p4a_launcher_main_2_accel(const double *A, double s[8000], int i, int ny, double r_0) __attribute__ ((task_implementation ("cuda", p4a_launcher_main_2)));
void p4a_launcher_main_2(const double *A, double s[8000], int i, int ny, double r_0) { fprintf(stderr,"CPU stub\n"); /*exit(1);*/ } 


/*
 * "Naive" variant of the first task: same parameter list as
 * p4a_launcher_main but without the `output` attribute on `s`.
 *   - first line: declares naive_p4a_launcher_main with the `task` attribute;
 *   - second line: declares naive_p4a_launcher_main_accel as the "cuda"
 *     task_implementation of that task (definition not in this file;
 *     presumably in the attached kernels.cu -- TODO confirm);
 *   - third line: the plain definition, which only prints "CPU stub" to
 *     stderr (the exit(1) is commented out) and ignores its arguments.
 */
void naive_p4a_launcher_main(double s[8000], int ny) __attribute__((task));
void naive_p4a_launcher_main_accel(double s[8000], int ny) __attribute__ ((task_implementation ("cuda", naive_p4a_launcher_main)));
void naive_p4a_launcher_main(double s[8000], int ny) { fprintf(stderr,"CPU stub\n"); /*exit(1);*/ } 

/*
 * "Naive" variant of the second task: same parameter list as
 * p4a_launcher_main_1 but with no const qualifiers and no `output`
 * attribute on any buffer.
 *   - first line: declares naive_p4a_launcher_main_1 with the `task` attribute;
 *   - second line: declares naive_p4a_launcher_main_1_accel as the "cuda"
 *     task_implementation of that task (definition not in this file;
 *     presumably in the attached kernels.cu -- TODO confirm);
 *   - third line: the plain definition, which only prints "CPU stub" to
 *     stderr (the exit(1) is commented out) and ignores its arguments.
 */
void naive_p4a_launcher_main_1(double *A, double p[8000], double q[8000], int nx, int ny) __attribute__((task));
void naive_p4a_launcher_main_1_accel(double *A, double p[8000], double q[8000], int nx, int ny) __attribute__ ((task_implementation ("cuda", naive_p4a_launcher_main_1)));
void naive_p4a_launcher_main_1(double *A, double p[8000], double q[8000], int nx, int ny) { fprintf(stderr,"CPU stub\n"); /*exit(1);*/ } 

/*
 * "Naive" variant of the third task: same parameter list as
 * p4a_launcher_main_2 but without the const qualifier on `A`.
 *   - first line: declares naive_p4a_launcher_main_2 with the `task` attribute;
 *   - second line: declares naive_p4a_launcher_main_2_accel as the "cuda"
 *     task_implementation of that task (definition not in this file;
 *     presumably in the attached kernels.cu -- TODO confirm);
 *   - third line: the plain definition, which only prints "CPU stub" to
 *     stderr (the exit(1) is commented out) and ignores its arguments.
 */
void naive_p4a_launcher_main_2(double *A, double s[8000], int i, int ny, double r_0) __attribute__((task));
void naive_p4a_launcher_main_2_accel(double *A, double s[8000], int i, int ny, double r_0) __attribute__ ((task_implementation ("cuda", naive_p4a_launcher_main_2)));
void naive_p4a_launcher_main_2(double *A, double s[8000], int i, int ny, double r_0) { fprintf(stderr,"CPU stub\n"); /*exit(1);*/ } 




/*
 * Fill the benchmark inputs with deterministic values:
 *   r[i] = p[i] = i * pi        for i in [0, 8000)
 *   A[i][j]     = i * j / 8000  for i, j in [0, 8000), computed in double
 */
static void init_array(double A[8000][8000],double r[8000],double p[8000])
{
   int row = 0;

   while (row < 8000) {
      const double scaled = row*3.14159265358979323846;
      int col;

      r[row] = scaled;
      p[row] = scaled;

      /* The cast applies to `row` only, so the product and the division
         are both carried out in double precision. */
      for (col = 0; col < 8000; ++col)
         A[row][col] = (double) row*col/8000;

      ++row;
   }
}
/*
 * Benchmark driver.
 *
 * Ten times in a row it runs the "naive" task set and then the
 * const/output-annotated ("optimized") task set, printing the elapsed time
 * of each in milliseconds.  One measurement consists of:
 *   1. re-initializing A, r and p on the host with init_array();
 *   2. starting the timer;
 *   3. invoking one *_main task, one *_main_1 task and nx *_main_2 tasks;
 *   4. acquiring s and q, then stopping the timer and printing it.
 *
 * argc and argv are unused.  Returns 0.
 */
int main(int argc, char **argv)
{
   int i;
   int nx = 8000;
   int ny = 8000;

   (void) argc;
   (void) argv;

#pragma starpu initialize

   /* 8000 x 8000 doubles is 512 MB, far more than a usual stack limit, so
      the matrix is given static storage rather than automatic storage. */
   static double A[8000][8000];
   double r[8000];
   double s[8000];
   double p[8000];
   double q[8000];

#pragma starpu register A
#pragma starpu register r
#pragma starpu register s
#pragma starpu register p
#pragma starpu register q

   int nruns;
   for(nruns=0;nruns<10;nruns++) {

     printf("Run naive...\n");
     
     /* Initialize array. */
     /* init_array() writes A, r and p from the host, so those are the
        registered buffers that have to be acquired around the call. */
#pragma starpu acquire A
#pragma starpu acquire r
#pragma starpu acquire p
     init_array(A,r,p);
#pragma starpu release A
#pragma starpu release r
#pragma starpu release p
 
     /* Start timer. */
     timer_start();
     naive_p4a_launcher_main(s, ny);
     naive_p4a_launcher_main_1((double *)A, p, q, nx, ny);
     for(i = 0; i <= nx-1; i += 1) {
      double r_0 = r[i];
      naive_p4a_launcher_main_2((double *)A, s, i, ny, r_0);
     }
     /* Stop and print timer. */
     /* s and q are the buffers the tasks above were handed for writing;
        they are acquired before the timer is stopped. */
#pragma starpu acquire s
#pragma starpu acquire q
     timer_stop_display();
#pragma starpu release s
#pragma starpu release q


     printf("Run 'optimized'...\n");

     /* Initialize array. */
     /* Same as above: acquire exactly what init_array() writes. */
#pragma starpu acquire A
#pragma starpu acquire r
#pragma starpu acquire p
     init_array(A,r,p);
#pragma starpu release A
#pragma starpu release r
#pragma starpu release p


     /* Start timer. */
     timer_start();
     p4a_launcher_main(s, ny);
     p4a_launcher_main_1((double *)A, p, q, nx, ny);
     for(i = 0; i <= nx-1; i += 1) {
      double r_0 = r[i];
      p4a_launcher_main_2((double *)A, s, i, ny, r_0);
     }
     /* Stop and print timer. */
#pragma starpu acquire s
#pragma starpu acquire q
     timer_stop_display();
#pragma starpu release s
#pragma starpu release q
   }
   
#pragma starpu unregister s
#pragma starpu unregister q
#pragma starpu unregister A
#pragma starpu unregister r
#pragma starpu unregister p 
#pragma starpu shutdown
   return 0;

}

Attachment: compile.sh
Description: application/shellscript

Attachment: kernels.cu
Description: application/cu-seeme




Archives gérées par MHonArc 2.6.19+.

Haut de la page