Accéder au contenu.
Menu Sympa

starpu-devel - Re: [Starpu-devel] CUDA task and GCC plugin

Objet : Developers list for StarPU

Archives de la liste

Re: [Starpu-devel] CUDA task and GCC plugin


Chronologique Discussions 
  • From: Mehdi AMINI <mehdi.amini@silkan.com>
  • To: starpu-devel@lists.gforge.inria.fr
  • Subject: Re: [Starpu-devel] CUDA task and GCC plugin
  • Date: Mon, 11 Jun 2012 18:29:55 +0200
  • List-archive: <http://lists.gforge.inria.fr/pipermail/starpu-devel>
  • List-id: "Developers list. For discussion of new features, code changes, etc." <starpu-devel.lists.gforge.inria.fr>

Hi,

I also have an issue related to implicit cpu implementations and -O3
gcc flag which seems to be broken. I'm testing a simple task call with
a CPU and a CUDA implementation. Using -O3 the CPU task is always
executed while without the -O3 flag the CUDA task is correctly
executed. Is it a known issue ?

I believe there’s nothing wrong here: the default scheduler (‘eager’)
makes non-deterministic scheduling decisions, and just happened to favor
CPU over CUDA for some reason.


It is reproducible 100% of the time.
Moreover, StarPU was compiled with --disable-cpu, and I can even
reproduce it using STARPU_NCPUS=0!

My guess is that -O3 may trigger inlining of the original implicit
implementation at the call site, so that the StarPU wrapper is never called
and the code of the CPU function is executed directly instead.


I confirmed it by studying the assembly generated for a simple example (see attached).

The simple printf in the implicit CPU implementation of the StarPU task is inlined, and there is no call anymore in the main function.

By the way, there seems to be another bug: the plugin segfaults when a task has no parameters.

Mehdi
#include <stdio.h>


/* Declares `launch` with the `task` attribute used by the GCC plugin this
   thread is about.  In the assembly listing attached below, the symbol
   `launch` is a function that calls starpu_data_lookup() and
   starpu_insert_task(), while the body written here ends up in a separate
   symbol named `launch.cpu_implementation`. */
static void launch(double *A) __attribute__ ((task));


/* Body of the task as written by the user (the "implicit CPU
   implementation" discussed in the email).  It only prints "CPU", which
   makes it visible at run time which implementation was executed.
   A: pointer argument of the task; not read or written by this body. */
static void launch(double *A) {
  printf("CPU");
}

/* CUDA implementation of the `launch` task.  It is only declared here; its
   definition is not part of this file.  In the assembly listing below it is
   called from `launch_cuda.task_implementation_wrapper`, whose address is
   stored in `launch.codelet`. */
extern void launch_cuda(double *A)
       __attribute__ ((task_implementation ("cuda", launch)));

/* Entry point of the reproducer: initialise StarPU through the plugin
   pragma, then invoke the `launch` task once on a local array.
   argc/argv are unused.  There is no return statement; in the attached
   listing main returns without setting eax after the call to launch. */
int main(int argc, char **argv) {
  /* In the attached listing this pragma corresponds to a call to
     starpu_init(0) followed by an abort-with-message path when the
     result is non-zero. */
#pragma starpu initialize
  double A[10];
  /* NOTE(review): no registration of A with StarPU is visible in this
     file, and the generated `launch` aborts with "attempt to use
     unregistered pointer" when starpu_data_lookup() returns 0 -- confirm
     whether registration was omitted on purpose for this test. */
  launch(A);
}
# Assembly listing for test.c, AT&T syntax, x86-64.
# The instructions spill every value to the stack and reload it, as
# unoptimised compiler output typically does.
.file "test.c"
.section .rodata
.align 8
# Message printed by launch when starpu_data_lookup returns 0.
.LC0:
.string "test.c:7: error: attempt to use unregistered pointer\n"
.align 8
# printf format used by launch when starpu_insert_task returns non-zero.
.LC1:
.string "test.c:7: error: failed to insert task `launch': %s\n"
.text
# ---------------------------------------------------------------------
# launch -- task-submission function for the launch task (file-local:
#           there is no .globl directive for it).
# In:    rdi = pointer argument (A in test.c)
# Flow:  look the pointer up with starpu_data_lookup; if the result is 0
#        print .LC0 and abort; otherwise pass the result, together with
#        the address of launch.codelet, to starpu_insert_task; if that
#        call returns non-zero, print .LC1 with strerror(-result) and
#        abort.
# Frame: -24(%rbp) = incoming rdi
#        -8(%rbp)  = value returned by starpu_data_lookup
#        -12(%rbp) = value returned by starpu_insert_task
# ---------------------------------------------------------------------
.type launch, @function
launch:
.LFB55:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
# Spill the pointer argument, then pass it unchanged to the lookup.
movq %rdi, -24(%rbp)
movq -24(%rbp), %rax
movq %rax, %rdi
call starpu_data_lookup
movq %rax, -8(%rbp)
# A zero result takes the error path below; non-zero continues at .L2.
cmpq $0, -8(%rbp)
jne .L2
movl $.LC0, %edi
call puts
call abort
.L2:
# Argument setup for starpu_insert_task:
#   edi = address of launch.codelet
#   esi = constant 3 (presumably an access-mode value -- TODO confirm
#         against the StarPU headers)
#   rdx = value returned by starpu_data_lookup
#   ecx = 0 (presumably the terminator of the argument list -- TODO confirm)
#   eax = 0, i.e. al = 0 vector registers used, as the SysV ABI asks for
#         before a variadic call
movq -8(%rbp), %rax
movl $0, %ecx
movq %rax, %rdx
movl $3, %esi
movl $launch.codelet, %edi
movl $0, %eax
call starpu_insert_task
movl %eax, -12(%rbp)
# Zero means success: skip the error report and return.
cmpl $0, -12(%rbp)
je .L1
# Error path: the return value is negated before being given to strerror,
# and the resulting string fills the %s of .LC1.
movl -12(%rbp), %eax
negl %eax
movl %eax, %edi
call strerror
movq %rax, %rsi
movl $.LC1, %edi
movl $0, %eax
call printf
call abort
.L1:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE55:
.size launch, .-launch
.section .rodata
# Text printed by launch.cpu_implementation.
.LC2:
.string "CPU"
.text
# ---------------------------------------------------------------------
# launch.cpu_implementation -- holds the body written for launch in
#                              test.c (a single printf of "CPU").
# In:    rdi = pointer argument; it is spilled to -8(%rbp) and not used
#        afterwards.
# Calls: printf(.LC2) with eax = 0.
# Called from launch.cpu_implementation.task_implementation_wrapper.
# ---------------------------------------------------------------------
.type launch.cpu_implementation, @function
launch.cpu_implementation:
.LFB54:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movq %rdi, -8(%rbp)
# rdi = address of the "CPU" string; the 32-bit move into eax zero-extends
# into rax before it is copied to rdi.
movl $.LC2, %eax
movq %rax, %rdi
movl $0, %eax
call printf
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE54:
.size launch.cpu_implementation, .-launch.cpu_implementation
.section .rodata
.align 8
# printf format used by main when starpu_init returns non-zero.
.LC3:
.string "test.c:15: error: failed to initialize StarPU: %s\n"
.text
# ---------------------------------------------------------------------
# main -- program entry point (exported with .globl).
# In:    edi = argc, rsi = argv (both only spilled to the stack)
# Flow:  call starpu_init(0); on a non-zero result print .LC3 with
#        strerror(-result) and abort; otherwise call launch with the
#        address of the local buffer at -96(%rbp).
# Frame: -100(%rbp) = argc, -112(%rbp) = argv,
#        -4(%rbp)   = value returned by starpu_init,
#        -96(%rbp)  = start of the local buffer passed to launch
#                     (the array A of test.c).
# Note:  in this listing main calls the launch function, i.e. the one
#        that submits the task through starpu_insert_task.
# ---------------------------------------------------------------------
.globl main
.type main, @function
main:
.LFB56:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $112, %rsp
movl %edi, -100(%rbp)
movq %rsi, -112(%rbp)
# starpu_init is called with a zero argument.
movl $0, %edi
call starpu_init
movl %eax, -4(%rbp)
# Zero means success: continue at .L6.
cmpl $0, -4(%rbp)
je .L6
# Error path: report strerror(-result) through .LC3, then abort.
movl -4(%rbp), %eax
negl %eax
movl %eax, %edi
call strerror
movq %rax, %rsi
movl $.LC3, %edi
movl $0, %eax
call printf
call abort
.L6:
# rdi = address of the local buffer; this is the only argument of launch.
leaq -96(%rbp), %rax
movq %rax, %rdi
call launch
# eax is not set between the call and ret, so the value main returns is
# whatever launch left in eax.
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE56:
.size main, .-main
.section .rodata
# Name string referenced by the last field of launch.codelet.
.LC4:
.string "launch"
.data
.align 32
# ---------------------------------------------------------------------
# launch.codelet -- 856-byte initialised data object whose address launch
#                   passes to starpu_insert_task.
# The directives below add up to exactly 856 bytes.  The layout of the
# underlying structure is not visible in this file, so the field meanings
# given below are assumptions to be checked against the StarPU headers.
# ---------------------------------------------------------------------
.type launch.codelet, @object
.size launch.codelet, 856
launch.codelet:
# Offset 0: constant 10 (presumably a bit mask of the worker kinds that
# have an implementation -- TODO confirm).
.long 10
.zero 52
# Offset 56: address of the wrapper that calls launch.cpu_implementation.
.quad launch.cpu_implementation.task_implementation_wrapper
.quad 0
.zero 16
# Offset 88: address of the wrapper that calls launch_cuda.
.quad launch_cuda.task_implementation_wrapper
.quad 0
.zero 16
.quad 0
.zero 24
.zero 4
# Offsets 156 and 160: constants 1 and 3 (presumably the number of buffers
# and the access mode of the first buffer -- TODO confirm; launch passes
# the same constant 3 to starpu_insert_task).
.long 1
.long 3
.zero 28
.zero 656
# Offset 848: pointer to the "launch" name string.
.quad .LC4
# ---------------------------------------------------------------------
# launch_cuda.task_implementation_wrapper -- adapter stored in
# launch.codelet; unpacks one pointer and forwards it to launch_cuda.
# AT&T syntax, x86-64.
# In:    rdi = first argument; treated as a pointer to an array of
#              pointers (presumably the task buffers -- TODO confirm)
#        rsi = second argument; spilled to -32(%rbp) and not used
# Flow:  p = first quadword of the object that rdi[0] points to;
#        call launch_cuda(p)
# Frame: -24(%rbp) = incoming rdi, -32(%rbp) = incoming rsi,
#        -8(%rbp)  = unpacked pointer p
# ---------------------------------------------------------------------
.text
.globl launch_cuda.task_implementation_wrapper
.type launch_cuda.task_implementation_wrapper, @function
launch_cuda.task_implementation_wrapper:
.LFB57:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
# Two dependent loads: rax = rdi[0], then rax = first quadword at that
# address.
movq -24(%rbp), %rax
movq (%rax), %rax
movq (%rax), %rax
movq %rax, -8(%rbp)
# Forward the unpacked pointer as the only argument of launch_cuda.
movq -8(%rbp), %rax
movq %rax, %rdi
call launch_cuda
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE57:
# The .size directive must be on a single line; it had been wrapped after
# the comma, which left its second operand on a line of its own.
.size launch_cuda.task_implementation_wrapper, .-launch_cuda.task_implementation_wrapper
# ---------------------------------------------------------------------
# launch.cpu_implementation.task_implementation_wrapper -- adapter stored
# in launch.codelet; unpacks one pointer and forwards it to
# launch.cpu_implementation.  File-local (no .globl directive).
# AT&T syntax, x86-64.
# In:    rdi = first argument; treated as a pointer to an array of
#              pointers (presumably the task buffers -- TODO confirm)
#        rsi = second argument; spilled to -32(%rbp) and not used
# Flow:  p = first quadword of the object that rdi[0] points to;
#        call launch.cpu_implementation(p)
# Frame: -24(%rbp) = incoming rdi, -32(%rbp) = incoming rsi,
#        -8(%rbp)  = unpacked pointer p
# ---------------------------------------------------------------------
# The .type directive must be on a single line; it had been wrapped after
# the comma.
.type launch.cpu_implementation.task_implementation_wrapper, @function
launch.cpu_implementation.task_implementation_wrapper:
.LFB58:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
# Two dependent loads: rax = rdi[0], then rax = first quadword at that
# address.
movq -24(%rbp), %rax
movq (%rax), %rax
movq (%rax), %rax
movq %rax, -8(%rbp)
# Forward the unpacked pointer as the only argument of the CPU body.
movq -8(%rbp), %rax
movq %rax, %rdi
call launch.cpu_implementation
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE58:
# Same repair as for .type above: the .size operands are back on one line.
.size launch.cpu_implementation.task_implementation_wrapper, .-launch.cpu_implementation.task_implementation_wrapper
.ident "GCC: (GNU) 4.6.3 20120306 (Red Hat 4.6.3-2)"
.section .note.GNU-stack,"",@progbits



Archives gérées par MHonArc 2.6.19+.

Haut de la page