This article explains how to call CUDA kernels from C++ code, allowing for efficient parallel processing on NVIDIA GPUs.
CUDA, C++, GPU Programming, Parallel Computing, CUDA Kernels, NVIDIA, C++ CUDA Integration
#include <iostream>
#include <cuda_runtime.h>
// Element-wise integer addition: c[i] = a[i] + b[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one
// element, selected by its flat global thread id; there is no loop, so
// elements at positions >= the total number of launched threads are not
// written. Threads whose id is >= N exit without touching memory.
//
// a, b: input arrays, read at indices below N.
// c:    output array, written at indices below N.
// N:    number of elements to process.
__global__ void addKernel(int *a, int *b, int *c, int N) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads (grid larger than the data) have nothing to do.
    if (gid >= N) {
        return;
    }

    const int lhs = a[gid];
    const int rhs = b[gid];
    c[gid] = lhs + rhs;
}
// Demo driver: adds two 5-element host arrays on the GPU with addKernel and
// prints each "a + b = c" line to stdout.
//
// Returns 0 on success. If any CUDA runtime call or the kernel launch fails,
// the CUDA error string is written to std::cerr and 1 is returned. Device
// buffers are released on both paths.
int main() {
    const int ARRAY_SIZE = 5;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // Host arrays
    int h_a[ARRAY_SIZE] = {1, 2, 3, 4, 5};
    int h_b[ARRAY_SIZE] = {10, 20, 30, 40, 50};
    int h_c[ARRAY_SIZE] = {0};

    // Device arrays. Start as null so the cleanup below is safe even when an
    // allocation fails part-way (cudaFree on a null pointer does nothing).
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    // Each step runs only if every previous step succeeded; the first
    // failure is kept in `status` and reported after cleanup.

    // Allocate memory on the device
    cudaError_t status = cudaMalloc((void**)&d_a, ARRAY_BYTES);
    if (status == cudaSuccess) {
        status = cudaMalloc((void**)&d_b, ARRAY_BYTES);
    }
    if (status == cudaSuccess) {
        status = cudaMalloc((void**)&d_c, ARRAY_BYTES);
    }

    // Copy data from host to device
    if (status == cudaSuccess) {
        status = cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice);
    }
    if (status == cudaSuccess) {
        status = cudaMemcpy(d_b, h_b, ARRAY_BYTES, cudaMemcpyHostToDevice);
    }

    if (status == cudaSuccess) {
        // Launch kernel with 1 block and ARRAY_SIZE threads (one per element)
        addKernel<<<1, ARRAY_SIZE>>>(d_a, d_b, d_c, ARRAY_SIZE);
        // A launch has no return value; configuration errors are reported
        // through cudaGetLastError().
        status = cudaGetLastError();
    }

    // Copy result back to host. This copy is blocking, so it also waits for
    // the kernel and reports any error raised while it was running.
    if (status == cudaSuccess) {
        status = cudaMemcpy(h_c, d_c, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    }

    // Free device memory on every path
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    if (status != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    // Display the result
    for (int i = 0; i < ARRAY_SIZE; i++) {
        std::cout << h_a[i] << " + " << h_b[i] << " = " << h_c[i] << std::endl;
    }
    return 0;
}
How do I avoid rehashing overhead with std::set in multithreaded code?
How do I find elements with custom comparators with std::set for embedded targets?
How do I erase elements while iterating with std::set for embedded targets?
How do I provide stable iteration order with std::unordered_map for large datasets?
How do I reserve capacity ahead of time with std::unordered_map for large datasets?
How do I erase elements while iterating with std::unordered_map in multithreaded code?
How do I provide stable iteration order with std::map for embedded targets?
How do I provide stable iteration order with std::map in multithreaded code?
How do I avoid rehashing overhead with std::map in performance-sensitive code?
How do I merge two containers efficiently with std::map for embedded targets?