<<< >>> cuda in vscode

Is there any way to suppress the "<<< >>>" error with vscode-cpptools?

I associated "*.cu" with "cpp" in settings.json:

// use normal c++ syntax highlighting for CUDA files
  "files.associations": {"*.cu": "cpp"},

This works fine except for one problem: the kernel execution configuration parameters surrounded by <<< and >>> are mistakenly flagged with the error "expected an expression".

dim3 dimGrid(2, 2, 1);
dim3 dimBlock(width / 2, width / 2, 1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, width);

Any suggestions?

2 Answers

You can just install the vscode-cudacpp extension and then, in your workspace file (<name>.code-workspace) or your settings file (.vscode/settings.json), enable this option:

"settings": {
    "files.associations": {
        "*.cu": "cuda",
        "*.cuh": "cuda"
    }
}
After googling for a few hours, I found no perfect solution, only some workarounds.

I summarize here:

  • use normal C++ syntax highlighting for CUDA files by editing settings.json
  • include necessary header of CUDA in program
  • include dummy header to workaround INTELLISENSE

Below is a concrete example:

  • settings.json
"files.associations": {
    "*.cu": "cpp",
    "*.cuh": "cpp"
}
  • cudaDmy.cuh
#pragma once
// Dummy header that hides CUDA-only syntax from the editor's IntelliSense
// parser while leaving real builds untouched.
//
// The IntelliSense engine defines __INTELLISENSE__ while parsing; nvcc does
// not. Under IntelliSense the KERNEL_ARGn macros expand to nothing, so
//     Kernel KERNEL_ARG2(grid, block) (args...);
// reads as an ordinary function call. Under nvcc they expand to the real
// <<< ... >>> execution configuration.
#ifdef __INTELLISENSE__
void __syncthreads();  // workaround __syncthreads warning
#define KERNEL_ARG2(grid, block)
#define KERNEL_ARG3(grid, block, sh_mem)
#define KERNEL_ARG4(grid, block, sh_mem, stream)
#else
#define KERNEL_ARG2(grid, block) <<< grid, block >>>
#define KERNEL_ARG3(grid, block, sh_mem) <<< grid, block, sh_mem >>>
#define KERNEL_ARG4(grid, block, sh_mem, stream) <<< grid, block, sh_mem, stream >>>
#endif
  • matrixMul.cu
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <device_functions.h>
#include <cuda_runtime_api.h>
#include "cudaDmy.cuh"

// Computes P = M * N for square width x width matrices stored row-major in
// flat arrays.
//
// Launch layout: a 2D grid of 2D blocks; each thread computes one element
// P[Row][Col], where Row comes from the y dimension and Col from the x
// dimension. Threads whose (Row, Col) falls outside the matrix do nothing,
// so the grid may overshoot the matrix size. No shared memory is used.
//
// M, N : input matrices, width * width floats each
// P    : output matrix, width * width floats
// width: number of rows/columns of each matrix
__global__ void MatrixMulKernel(float *M, float *N, float *P, int width)
{
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    // Bounds check: the launch grid may cover more threads than elements.
    if (Row < width && Col < width)
    {
        float Pvalue = 0.0f;
        // Dot product of row `Row` of M with column `Col` of N.
        for (int i = 0; i < width; ++i)
        {
            Pvalue += M[Row * width + i] * N[width * i + Col];
        }
        P[Row * width + Col] = Pvalue;
    }
}

// Host wrapper: multiplies two square width x width row-major matrices on the
// GPU and stores the product in P.
//
// M, N : host input matrices, width * width floats each
// P    : host output buffer, width * width floats
// width: number of rows/columns; nothing is done when width <= 0
//
// On any CUDA failure a message is printed to stderr and P is left unchanged.
// Device buffers are always released before returning.
void MatMul(float *M, float *N, float *P, int width)
{
    if (width <= 0)
    {
        return;  // an empty grid is an invalid launch configuration
    }

    float *d_M = NULL;
    float *d_N = NULL;
    float *d_P = NULL;
    // Widen before multiplying so width * width cannot overflow int.
    const size_t size = (size_t)width * (size_t)width * sizeof(float);

    // Fixed 16x16 blocks with a ceil-div grid: covers odd widths and stays
    // under the per-block thread limit for any width (the kernel bounds-checks).
    const int tile = 16;
    const unsigned int blocks = (unsigned int)((width + tile - 1) / tile);
    dim3 dimGrid(blocks, blocks, 1);
    dim3 dimBlock(tile, tile, 1);

    // Each step runs only if every previous step succeeded.
    cudaError_t err = cudaMalloc((void **)&d_M, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_N, size);
    if (err == cudaSuccess) err = cudaMalloc((void **)&d_P, size);
    if (err == cudaSuccess) err = cudaMemcpy(d_M, M, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_N, N, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
    {
        // nvcc expands KERNEL_ARG2 to <<<dimGrid, dimBlock>>>; IntelliSense
        // sees an empty macro and therefore a plain function call.
        MatrixMulKernel KERNEL_ARG2(dimGrid, dimBlock) (d_M, d_N, d_P, width);
        err = cudaGetLastError();  // launches report errors only through this
    }
    // The blocking copy also surfaces errors raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(P, d_P, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "MatMul: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts null pointers, so this is safe after a partial failure.
    cudaFree(d_M);
    cudaFree(d_N);
    cudaFree(d_P);
}

// Demo driver: fills two 10x10 matrices, multiplies them with MatMul, then
// prints the elapsed wall-clock time and the resulting matrix.
int main()
{
    const int width = 10;            // matrix dimension
    const int elem = width * width;  // elements per matrix
    float *M = new float[elem];
    float *N = new float[elem];
    float *P = new float[elem]();    // zero-initialized so output is defined even on failure

    for (int i = 0; i < elem; ++i)
    {
        M[i] = i;
    }

    for (int i = 0; i < elem; ++i)
    {
        N[i] = i + elem;
    }

    // time() has one-second resolution, so this is only a coarse measurement.
    time_t t1 = time(NULL);
    MatMul(M, N, P, width);
    time_t t2 = time(NULL);
    double seconds = difftime(t2, t1);
    printf ("%.3f seconds total time\n", seconds);

    // Print every element of the product.
    for (int i = 0; i < elem; ++i)
    {
        printf("%.1f\t", P[i]);
    }
    printf("\n");

    delete[] M;
    delete[] N;
    delete[] P;
    return 0;
}

Let's compile it with NVCC

nvcc matrixMul.cu -Xcudafe "--diag_suppress=unrecognized_pragma" -o runcuda

useful links:

  • https://devtalk.nvidia.com/default/topic/513485/cuda-programming-and-performance/__syncthreads-is-undefined-need-a-help/post/5189004/#5189004
  • https://stackoverflow.com/a/6182137/8037585
  • https://stackoverflow.com/a/27992604/8037585
  • https://gist.github.com/ruofeidu/df95ba27dfc6b77121b27fd4a6483426
