Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

CUDA: compilation of LLVM IR using NVPTX

Tags:

cuda

llvm

ptx

For my project, I am generating PTX instructions for some functions in two different ways. The first method uses CUDA C to implement the functions and nvcc to compile them, using nvcc -ptx <file>.cu -o <file>.ptx. The other method writes the code in a different language, generates LLVM IR from it, and compiles that to PTX using the NVPTX backend. The problem I encounter is that some functions perform worse in the second case, while other functions show more or less comparable performance.

Now I want to know why there is such a difference in performance for some functions (and why there isn't any for the others), but profiling using nsight hasn't given me any good idea yet.

The only difference I have found up until now is the register usage. In the resulting PTX code I can see the following:

compiled using nvcc

.reg .u32 %r<8>;
.reg .u64 %rd<17>;
.reg .f32 %f<8>;
.reg .pred %p<5>;

compiled using nvptx

.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f64 %fl<396>;

As far as I understand, this signifies the number and type of used virtual registers, but as you can clearly see, this isn't correct in the second case. After profiling with nsight, I can see that the number of actually used registers/thread is 8 in the first case, and 31 in the second case. Of course, this might be an indication of why the code in the second case is slower, but the problem is that all my functions that are compiled from LLVM IR to ptx using NVPTX have this problem. They all have 396 used virtual registers and nsight reports 31 used registers/thread for all of them, even though some functions yield almost exactly the same performance as in the first case.

Is this register usage the cause of my slowdown? And if so, why doesn't it affect all functions? If it's not, what might be causing the slowdown? Can you give any tips on which direction I should be looking in?

Thanks!

(Version of LLVM used is 3.3)

EDIT: another difference I have noticed is the stall reasons:

NVCC:

Stall reasons for nvcc compiled code

NVPTX:

Stall reasons for nvptx compiled code

Apparently, there is a relative increase for the "other" reasons. Perhaps this might explain the problem?

EDIT: added ptx source code

The function that is shown here copies data from global memory to shared memory. Then each thread compares its own element and the previous element with the last element in the array. If the comparison is positive, the index is written to the output array.

1) LLVM IR compiled to PTX using NVPTX

// Kernel entry as emitted by the LLVM NVPTX backend (per the surrounding post).
// What the listing does:
//   1. Computes a 64-bit index ntid.x * ctaid.x + tid.x and loads one f32 from
//      param_0 at that index into the shared array `shmem1` at slot tid.x.
//   2. After a barrier, compares the previous and own shared elements against
//      0.5 * (last shared element), with the comparisons done in f64.
//   3. If prev < half <= own, stores tid.x + 1 to param_1 at slot ctaid.x.
//   4. Negative-index checks branch to %oob blocks that call vprintf.
// Index arithmetic uses 64-bit (%rl) registers and the comparisons use f64
// (%fl) registers throughout.
// NOTE(review): %f0 is used below, but no `.reg .f32` declaration appears in
// this listing.
// .globl   julia_cuda_find_weighted_median18585
.entry julia_cuda_find_weighted_median18585(
    .param .u64 .ptr .global .align 4 julia_cuda_find_weighted_median18585_param_0,
    .param .u64 .ptr .global .align 4 julia_cuda_find_weighted_median18585_param_1
)                                       // @julia_cuda_find_weighted_median18585
{
    .reg .pred %p<396>;
    .reg .s16 %rc<396>;
    .reg .s16 %rs<396>;
    .reg .s32 %r<396>;
    .reg .s64 %rl<396>;
    .reg .f64 %fl<396>;

// BB#0:                                // %top
    // %rl4 = tid.x, %rl0 = ctaid.x, %rl3 = ntid.x, each sign-extended to 64 bits.
    mov.u32     %r0, %tid.x;
    cvt.s64.s32     %rl4, %r0;
    mov.u32     %r0, %ctaid.x;
    cvt.s64.s32     %rl0, %r0;
    mov.u32     %r1, %ntid.x;
    cvt.s64.s32     %rl3, %r1;
    // %rl5 = ntid.x * ctaid.x + tid.x (64-bit multiply-add).
    mad.lo.s64  %rl5, %rl3, %rl0, %rl4;
    // Index check: continue at %idxend when %rl5 > -1, otherwise go to %oob.
    setp.gt.s64     %p0, %rl5, -1;
    @%p0 bra    BB9_2;
    bra.uni     BB9_1;
BB9_2:                                  // %idxend
    ld.param.u64    %rl6, [julia_cuda_find_weighted_median18585_param_0];
    ld.param.u64    %rl2, [julia_cuda_find_weighted_median18585_param_1];
    // %rl1 = tid.x + 1; used for the `< 2` test below and as the stored value.
    add.s64     %rl1, %rl4, 1;
    // Address of shmem1 is converted to a generic address here and back to a
    // shared-space address a few lines below (cvta.shared / cvta.to.shared).
    mov.u64     %rl7, shmem1;
    cvta.shared.u64     %rl7, %rl7;
    // %rl5 = param_0 + index * 4; load one f32 from global memory.
    shl.b64     %rl5, %rl5, 2;
    add.s64     %rl5, %rl6, %rl5;
    ld.global.f32   %f0, [%rl5];
    // %rl5 = shared base of shmem1; %rl6 = tid.x * 4; %rl4 = &shmem1[tid.x].
    cvta.to.shared.u64  %rl5, %rl7;
    shl.b64     %rl6, %rl4, 2;
    add.s64     %rl4, %rl5, %rl6;
    st.shared.f32   [%rl4], %f0;
    bar.sync    0;
    // Return early when tid.x + 1 < 2, i.e. for tid.x == 0.
    setp.lt.s64     %p0, %rl1, 2;
    @%p0 bra    BB9_8;
// BB#3:                                // %if
    // Load shmem1[ntid.x - 1] (last element), widen to f64 and multiply by
    // 0d3FE0000000000000 (0.5 in double precision) -> %fl0.
    shl.b64     %rl3, %rl3, 2;
    add.s64     %rl3, %rl3, %rl5;
    ld.shared.f32   %f0, [%rl3+-4];
    cvt.f64.f32     %fl0, %f0;
    mul.f64     %fl0, %fl0, 0d3FE0000000000000;
    // Load shmem1[tid.x - 1] (previous element), widen to f64 -> %fl1.
    add.s64     %rl3, %rl6, %rl5;
    ld.shared.f32   %f0, [%rl3+-4];
    cvt.f64.f32     %fl1, %f0;
    // Return unless previous < half (geu = greater-or-equal, or unordered).
    setp.geu.f64    %p0, %fl1, %fl0;
    @%p0 bra    BB9_8;
// BB#4:                                // %L2
    // Load this thread's own element shmem1[tid.x], widen to f64 -> %fl1.
    ld.shared.f32   %f0, [%rl4];
    cvt.f64.f32     %fl1, %f0;
    // Return unless half <= own (gtu = greater-than, or unordered).
    setp.gtu.f64    %p0, %fl0, %fl1;
    @%p0 bra    BB9_8;
// BB#5:                                // %if3
    // %r0 still holds %ctaid.x here; index check before the output store.
    setp.gt.s32     %p0, %r0, -1;
    @%p0 bra    BB9_7;
    bra.uni     BB9_6;
BB9_7:                                  // %idxend5
    // Store tid.x + 1 (held in the 64-bit register %rl1) with a .u32 store
    // to param_1 + ctaid.x * 4.
    shl.b64     %rl0, %rl0, 2;
    add.s64     %rl0, %rl2, %rl0;
    st.global.u32   [%rl0], %rl1;
BB9_8:                                  // %L6
    ret;
BB9_1:                                  // %oob
    // Failure path of the first index check: calls vprintf with the address
    // of cu_oob passed as both arguments, then returns.
    mov.u64     %rl0, cu_oob;
    // Callseq Start 26
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b64 param0;
    st.param.b64    [param0+0], %rl0;
    .param .b64 param1;
    st.param.b64    [param1+0], %rl0;
    .param .b32 retval0;
    call.uni (retval0), 
    vprintf, 
    (
    param0, 
    param1
    );
    ld.param.b32    %r0, [retval0+0];

    //{
    }// Callseq End 26
    ret;
BB9_6:                                  // %oob4
    // Failure path of the second index check: same vprintf call sequence as
    // the %oob block.
    mov.u64     %rl0, cu_oob;
    // Callseq Start 27
    {
    .reg .b32 temp_param_reg;
    // <end>}
    .param .b64 param0;
    st.param.b64    [param0+0], %rl0;
    .param .b64 param1;
    st.param.b64    [param1+0], %rl0;
    .param .b32 retval0;
    call.uni (retval0), 
    vprintf, 
    (
    param0, 
    param1
    );
    ld.param.b32    %r0, [retval0+0];

    //{
    }// Callseq End 27
    ret;
}

2) CUDA C compiled to PTX using nvcc

// Kernel entry as emitted by nvcc from the CUDA C version (per the surrounding post).
// What the listing does:
//   1. Loads one f32 from `prescan` at index ntid.y * ctaid.x + tid.y into the
//      array `temp` at slot tid.y (written with st.shared).
//   2. After a barrier, threads with tid.y > 0 compare the previous and own
//      elements against 0.5 * (last element), all in f32.
//   3. If prev < half <= own, stores tid.y to `output` at slot ctaid.x.
// Index arithmetic is done in 32-bit registers and widened with mul.wide.s32;
// there are no index checks and no f64 operations.
// NOTE(review): this listing indexes with %tid.y / %ntid.y, and the `input`
// parameter is never loaded in the visible code.
.entry findWeightedMedian_kernel (
        .param .u64 __cudaparm_findWeightedMedian_kernel_input,
        .param .u64 __cudaparm_findWeightedMedian_kernel_prescan,
        .param .u64 __cudaparm_findWeightedMedian_kernel_output)
    {
    .reg .u32 %r<8>;
    .reg .u64 %rd<17>;
    .reg .f32 %f<8>;
    .reg .pred %p<5>;
    .loc    4   93  0
$LDWbegin_findWeightedMedian_kernel:
    // %rd1 = base address of `temp`.
    mov.u64     %rd1, temp;
    .loc    4   103 0
    // %r1 = tid.y; %rd4 = &temp[tid.y].
    // (%rd2, like %rd6, %rd9 and %rd13 below, is not read again in this listing.)
    cvt.s32.u16     %r1, %tid.y;
    cvt.s64.s32     %rd2, %r1;
    mul.wide.s32    %rd3, %r1, 4;
    add.u64     %rd4, %rd1, %rd3;
    // %r2 = ntid.y, %r3 = ctaid.x.
    cvt.s32.u16     %r2, %ntid.y;
    cvt.s32.u16     %r3, %ctaid.x;
    ld.param.u64    %rd5, [__cudaparm_findWeightedMedian_kernel_prescan];
    // %r5 = ntid.y * ctaid.x + tid.y, computed in 32 bits.
    mul.lo.s32  %r4, %r2, %r3;
    add.s32     %r5, %r1, %r4;
    cvt.s64.s32     %rd6, %r5;
    // %rd8 = prescan + %r5 * 4; copy that f32 into temp[tid.y].
    mul.wide.s32    %rd7, %r5, 4;
    add.u64     %rd8, %rd5, %rd7;
    ld.global.f32   %f1, [%rd8+0];
    st.shared.f32   [%rd4+0], %f1;
    .loc    4   104 0
    bar.sync    0;
    // Exit when tid.y <= 0.
    mov.u32     %r6, 0;
    setp.le.s32     %p1, %r1, %r6;
    @%p1 bra    $Lt_1_3074;
    .loc    4   107 0
    // %f2 = temp[ntid.y - 1] (last element); %f4 = 0.5 * %f2.
    cvt.s64.s32     %rd9, %r2;
    mul.wide.s32    %rd10, %r2, 4;
    add.u64     %rd11, %rd1, %rd10;
    ld.shared.f32   %f2, [%rd11+-4];
    mov.f32     %f3, 0f3f000000;        // 0.5
    mul.f32     %f4, %f2, %f3;
    // %f5 = temp[tid.y - 1] (previous element); exit unless %f5 < %f4.
    ld.shared.f32   %f5, [%rd4+-4];
    setp.lt.f32     %p2, %f5, %f4;
    @!%p2 bra   $Lt_1_3074;
    // %f6 = temp[tid.y] (own element); exit unless %f6 >= %f4.
    ld.shared.f32   %f6, [%rd4+0];
    setp.ge.f32     %p3, %f6, %f4;
    @!%p3 bra   $Lt_1_3074;
    .loc    4   109 0
    // Store tid.y (as s32) to output + ctaid.x * 4.
    ld.param.u64    %rd12, [__cudaparm_findWeightedMedian_kernel_output];
    cvt.s64.s32     %rd13, %r3;
    mul.wide.s32    %rd14, %r3, 4;
    add.u64     %rd15, %rd12, %rd14;
    st.global.s32   [%rd15+0], %r1;
$Lt_1_3074:
$L_1_2050:
$Lt_1_2562:
    .loc    4   111 0
    exit;
$LDWend_findWeightedMedian_kernel:
    } // findWeightedMedian_kernel
like image 705
PieterV Avatar asked May 26 '14 15:05

PieterV


1 Answer

I think I have found the reason for the slowdown, or at least a major part of it (about 76%). The type system in my custom toolchain automatically uses 64-bit types for literal values in the code (based on the architecture of the CPU). This leads to unnecessary 64-bit calculations, which don't appear in the CUDA C version.

like image 197
PieterV Avatar answered Sep 30 '22 14:09

PieterV