This work is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License
Computer Graphics
CUDA Matrix Multiplication
Mike Bailey
Copyright By Assignmentchef assignmentchef
cudaMatrixMult.pptx
mjb May 4, 2021
// NUMT is the thread-block edge length: the launch uses dim3 threads( NUMT, NUMT, 1 ).
// The #ifndef guard lets a compile-time definition of NUMT take precedence.
// NOTE(review): the default value was lost when this slide text was extracted --
// restore it from the original deck before compiling.
#ifndef NUMT
#define NUMT
#endif
Anatomy of the CUDA matrixMult Program: 2 #defines, #includes, and Globals
#include
#include
// MATRIX_SIZE is the edge length used for all of the matrix dimension macros.
// The #ifndef guard lets a compile-time definition of MATRIX_SIZE take precedence.
// NOTE(review): the default value was lost when this slide text was extracted --
// restore it from the original deck before compiling.
#ifndef MATRIX_SIZE
#define MATRIX_SIZE
#endif
// matrix dimensions: [A] is AROWS x ACOLS, [B] is BROWS x BCOLS, [C] is CROWS x CCOLS
#define AROWS		MATRIX_SIZE
#define ACOLS		MATRIX_SIZE
#define BROWS		MATRIX_SIZE
#define BCOLS		MATRIX_SIZE
#define ACOLSBROWS	ACOLS		// ACOLS and BROWS had better be the same!
#define CROWS		AROWS
#define CCOLS		BCOLS
// host-side matrices (the 'h' prefix pairs with the device pointers dA, dB, dC):
// hA and hB are copied to the device with cudaMemcpyHostToDevice,
// and hC is filled from the device with cudaMemcpyDeviceToHost.
float hA[AROWS][ACOLS];
float hB[BROWS][BCOLS];
float hC[CROWS][CCOLS];
Computer Graphics
mjb May 4, 2021
Anatomy of a CUDA Program: 3 Error-Checking
// Report the most recent CUDA runtime error, if there is one, on stderr.
// cudaGetLastError( ) returns the last error recorded by the runtime and resets it,
// so call this right after the CUDA calls you want to check.
// NOTE: __FILE__ and __LINE__ expand inside this function, so the message gives
// this function's location, not the caller's.
void
CudaCheckError( )
{
	cudaError_t e = cudaGetLastError( );
	if( e != cudaSuccess )
	{
		fprintf( stderr, "CUDA failure %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(e) );
	}
}
Computer Graphics
mjb May 4, 2021
Anatomy of a CUDA Program: 4 The Kernel Function
// Each thread computes one element of [C] = [A] * [B].
// A, B, and C are indexed as flat, row-major arrays.
__global__ void MatrixMul( float *A, float *B, float *C )
{
	// [A] is AROWS x ACOLS
	// [B] is BROWS x BCOLS
	// [C] is CROWS x CCOLS = AROWS x BCOLS

	// flatten the 2D grid of 2D blocks into a single global thread number:
	int blockNum     = blockIdx.y*gridDim.x + blockIdx.x;
	int blockThreads = blockNum*blockDim.x*blockDim.y;
	int gid          = blockThreads + threadIdx.y*blockDim.x + threadIdx.x;

	// the element of [C] that this thread computes:
	int crow = gid / CCOLS;
	int ccol = gid % CCOLS;

	int aindex = crow * ACOLS;		// a[i][0]
	int bindex = ccol;			// b[0][j]
	int cindex = crow * CCOLS + ccol;	// c[i][j]

	float cij = 0.;
	for( int k = 0; k < ACOLSBROWS; k++ )
	{
		cij += A[aindex] * B[bindex];
		aindex++;		// next element along row i of [A]
		bindex += BCOLS;	// next element down column j of [B]
	}
	C[cindex] = cij;

	// __syncthreads( );
}
Computer Graphics
mjb May 4, 2021
Anatomy of a CUDA Program: 5 Setting Up the Memory for the Matrices
// allocate device memory:
float *dA, *dB, *dC;
cudaMalloc( (void **)(&dA), sizeof(hA) );
cudaMalloc( (void **)(&dB), sizeof(hB) );
cudaMalloc( (void **)(&dC), sizeof(hC) );
CudaCheckError( );
// copy host memory to device memory:
cudaMemcpy( dA, hA, sizeof(hA), cudaMemcpyHostToDevice );
cudaMemcpy( dB, hB, sizeof(hB), cudaMemcpyHostToDevice );
cudaMemcpyHostToDevice is a defined constant in one of the CUDA .h files
In cudaMemcpy( ), it's always the second argument getting copied to the first!
Computer Graphics
mjb May 4, 2021
Anatomy of a CUDA Program: 6 Getting Ready to Execute
// setup execution parameters:
dim3 threads( NUMT, NUMT, 1 );
if( threads.x > CROWS )
	threads.x = CROWS;
if( threads.y > CCOLS )
	threads.y = CCOLS;
// NOTE(review): these are integer divisions, and the kernel has no bounds check, so
// every element of [C] is computed only when CROWS is a multiple of threads.x and
// CCOLS is a multiple of threads.y.
dim3 grid( CROWS / threads.x, CCOLS / threads.y );
// create cuda events for timing:
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
CudaCheckError( );
// record the start event:
cudaEventRecord( start, NULL );
Computer Graphics
mjb May 4, 2021
Anatomy of a CUDA Program: 7 Executing the Kernel
// execute the kernel:
// (inside <<< >>>: grid is the number of blocks, threads is the number of threads per block;
//  dA, dB, and dC are the device pointers passed as the kernel's A, B, and C)
MatrixMul<<< grid, threads >>>( dA, dB, dC );
Inside the <<< >>>: the # of blocks (grid), then the # of threads per block (threads). Inside the parentheses: the function call arguments.
The call to MatrixMul( ) returns immediately!
If you upload the resulting array (dC) right away, it will have garbage in it.
To block until the kernel is finished, call:
cudaDeviceSynchronize( );
Computer Graphics
mjb May 4, 2021
Anatomy of a CUDA Program: 8 Getting the Stop Time and Printing Performance
// make sure the kernel has finished before taking the stop time:
cudaDeviceSynchronize( );
// record the stop event:
cudaEventRecord( stop, NULL );
// wait for the stop event to complete:
cudaEventSynchronize( stop );
float millisecsTotal;
cudaEventElapsedTime( &millisecsTotal, start, stop );	// note: this is in milliseconds
// performance in multiplies per second:
double secondsTotal = millisecsTotal / 1000.0;	// change it to seconds
double multipliesTotal = (double)CROWS * (double)CCOLS * (double)ACOLSBROWS;
double gigaMultipliesPerSecond = ( multipliesTotal / 1000000000. ) / secondsTotal;
// columns printed: CROWS, CCOLS, giga-multiplies per second
fprintf( stderr, "%6d\t%6d\t%10.3lf\n", CROWS, CCOLS, gigaMultipliesPerSecond );
Computer Graphics
mjb May 4, 2021
Anatomy of a CUDA Program:
Copying the Matrix from the Device back to the Host
// copy the result matrix from device memory back to host memory:
cudaMemcpy( hC, dC, sizeof(hC), cudaMemcpyDeviceToHost );
CudaCheckError( );
// clean up:
cudaFree( dA );
cudaFree( dB );
cudaFree( dC );
CudaCheckError( );
cudaMemcpyDeviceToHost is a defined constant in one of the CUDA .h files
In cudaMemcpy( ), it's always the second argument getting copied to the first!
Computer Graphics
mjb May 4, 2021
CS: assignmentchef QQ: 1823890830 Email: [email protected]
Reviews
There are no reviews yet.