Procedure
- Set Up CUDA on Your System
- Ensure you have a compatible NVIDIA GPU installed.
- Download and install CUDA Toolkit from NVIDIA’s website.
- Follow the installation prompts for your operating system.
- Verify the installation by running the command:
nvcc --version
This should return the CUDA compiler version.
- Verify GPU Availability
- Open a Command Prompt or Terminal and run the following:
nvidia-smi
This will display the GPU status, driver version, memory usage, and utilization.
- Create a CUDA Program to Test Performance
- Write a simple CUDA program to perform matrix multiplication. Example:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// Abort with file/line and the CUDA error string when a runtime call fails.
// Unchecked errors are sticky and make every later call fail mysteriously.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,       \
                         __LINE__, cudaGetErrorString(err_));              \
            std::abort();                                                  \
        }                                                                  \
    } while (0)

// Naive dense matrix multiply: C = A * B for row-major N x N matrices.
//
// Launch layout: 2D grid of 2D blocks; the x dimension maps to columns and
// the y dimension to rows. Each thread computes one element of C. The grid
// may overshoot N in either dimension; out-of-range threads do nothing.
//
// A, B, C must be device pointers to at least N * N floats each.
__global__ void matMul(const float* A, const float* B, float* C, int N) {
    const int row = threadIdx.y + blockIdx.y * blockDim.y;
    const int col = threadIdx.x + blockIdx.x * blockDim.x;

    if (row < N && col < N) {
        // Widen before multiplying so row * N cannot overflow int.
        const size_t rowBase = static_cast<size_t>(row) * N;

        float value = 0.0f;
        for (int k = 0; k < N; ++k) {
            value += A[rowBase + k] * B[static_cast<size_t>(k) * N + col];
        }
        C[rowBase + col] = value;
    }
}

// Multiplies two N x N all-ones matrices on the GPU and prints C[0][0]
// (expected to equal N).
int main() {
    const int N = 512;
    // Compute the element count in size_t; N * N in int overflows for large N.
    const size_t count = static_cast<size_t>(N) * N;
    const size_t size = count * sizeof(float);

    // Host buffers: A and B filled with ones, C receives the result.
    std::vector<float> h_A(count, 1.0f);
    std::vector<float> h_B(count, 1.0f);
    std::vector<float> h_C(count, 0.0f);

    float* d_A = nullptr;
    float* d_B = nullptr;
    float* d_C = nullptr;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B.data(), size, cudaMemcpyHostToDevice));

    const dim3 threadsPerBlock(16, 16);
    // Ceil-divide so every element is covered even when N is not a multiple
    // of the block dimensions; the kernel's bounds check handles the excess.
    const dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                         (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matMul<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // Kernel launches return no status; launch-configuration errors are
    // reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: also surfaces any fault raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(h_C.data(), d_C, size, cudaMemcpyDeviceToHost));

    std::cout << "Matrix C: " << h_C[0] << std::endl;

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}
- Compile the program:
nvcc -o matmul matmul.cu
- Run the program:
./matmul
- Benchmark GPGPU Performance
- Use nvidia-smi or CUDA Profiler (nvprof) to monitor utilization during execution.
- Test with different matrix sizes and compare speedup vs. CPU.
- Analyze Results
- Compare execution time for CPU and GPU.
- Measure memory usage, time, and efficiency using profiling tools.
- Document findings, speedup, and bottlenecks.