How do I call CUDA kernels from C++?

This article explains how to call CUDA kernels from C++ code, allowing for efficient parallel processing on NVIDIA GPUs.

CUDA, C++, GPU Programming, Parallel Computing, CUDA Kernels, NVIDIA, C++ CUDA Integration


#include <iostream>
#include <cuda_runtime.h>

// Element-wise integer addition on the device: c[i] = a[i] + b[i].
//
// Each thread handles at most one element, chosen by its flattened 1D global
// index (block offset plus thread offset within the block). Threads whose
// index is N or greater exit without touching memory, so the launch may
// supply more threads than there are elements. All N elements are written
// only if the launch supplies at least N threads in total.
//
//   a, b : input arrays, read at indices [0, N)
//   c    : output array, written at indices [0, N)
//   N    : number of elements to process
__global__ void addKernel(int *a, int *b, int *c, int N) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N) {
        return;  // surplus thread past the end of the arrays
    }
    c[gid] = a[gid] + b[gid];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error during " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two small integer arrays on the GPU and prints the element-wise sums.
//
// Steps: allocate device buffers, copy the inputs host -> device, launch
// addKernel, copy the result device -> host, print it, and free the device
// buffers. Every CUDA runtime call is checked; on the first failure the
// remaining GPU steps are skipped, the buffers are still released, and the
// process exits with status 1. Returns 0 on success.
int main() {
    const int ARRAY_SIZE = 5;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // Host arrays
    int h_a[ARRAY_SIZE] = {1, 2, 3, 4, 5};
    int h_b[ARRAY_SIZE] = {10, 20, 30, 40, 50};
    int h_c[ARRAY_SIZE] = {0};

    // Device arrays. Initialised to nullptr so the cleanup at the end can
    // call cudaFree unconditionally, even if an allocation never happened.
    int *d_a = nullptr;
    int *d_b = nullptr;
    int *d_c = nullptr;

    // Allocate device memory and copy the inputs over. The && chain
    // short-circuits, so nothing runs after the first failing call.
    bool ok =
        cudaSucceeded(cudaMalloc((void**)&d_a, ARRAY_BYTES), "cudaMalloc(d_a)") &&
        cudaSucceeded(cudaMalloc((void**)&d_b, ARRAY_BYTES), "cudaMalloc(d_b)") &&
        cudaSucceeded(cudaMalloc((void**)&d_c, ARRAY_BYTES), "cudaMalloc(d_c)") &&
        cudaSucceeded(cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice),
                      "cudaMemcpy(h_a -> d_a)") &&
        cudaSucceeded(cudaMemcpy(d_b, h_b, ARRAY_BYTES, cudaMemcpyHostToDevice),
                      "cudaMemcpy(h_b -> d_b)");

    if (ok) {
        // Launch kernel with 1 block and ARRAY_SIZE threads (one thread per element).
        addKernel<<<1, ARRAY_SIZE>>>(d_a, d_b, d_c, ARRAY_SIZE);

        // A launch does not return a status itself: configuration errors are
        // picked up with cudaGetLastError(). The blocking cudaMemcpy that
        // follows waits for the kernel and reports any execution error.
        ok = cudaSucceeded(cudaGetLastError(), "addKernel launch") &&
             cudaSucceeded(cudaMemcpy(h_c, d_c, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(d_c -> h_c)");
    }

    // Display the result only when every GPU step succeeded.
    if (ok) {
        for (int i = 0; i < ARRAY_SIZE; i++) {
            std::cout << h_a[i] << " + " << h_b[i] << " = " << h_c[i] << std::endl;
        }
    }

    // Free device memory on every path. A failure here is reported but does
    // not change the exit status.
    cudaSucceeded(cudaFree(d_a), "cudaFree(d_a)");
    cudaSucceeded(cudaFree(d_b), "cudaFree(d_b)");
    cudaSucceeded(cudaFree(d_c), "cudaFree(d_c)");

    return ok ? 0 : 1;
}
    

CUDA, C++, GPU Programming, Parallel Computing, CUDA Kernels, NVIDIA, C++ CUDA Integration