本文通过一个矩阵加法的例子来说明如何使用网格和块来组织线程。
一般情况下,一个矩阵用行优先的方式在全局内存中进行线性存储。如下图所示,这是一个8*6的矩阵。
在一个矩阵加法核函数中,一个线程通常被分配一个数据元素来处理。首先要使用块和线程索引从全局内存中访问指定的数据。接下来需要管理3种索引:
对于一个给定的线程,首先可以通过把线程和块索引映射到矩阵坐标上来获取该线程的全局内存偏移量,然后将这些矩阵坐标映射到全局内存的存储单元中。
第一步,可以用如下公式把线程和块索引映射到矩阵坐标上:
ix = threadIdx.x + blockDim.x*blockIdx.x; iy = threadIdx.y + blockDim.y*blockIdx.y;
第二步,可以用如下公式把矩阵坐标映射到全局内存中的索引/存储单元上:
idx = iy * nx + ix;
如下图说明了块和线程索引、矩阵坐标以及线性全局内存索引之间的对应关系。
使用一个二维网格和二维块来编写一个矩阵加法核函数,代码如下:
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Abort with file/line context when a CUDA runtime call fails.
// (The original checked no return codes; an early sticky error would make
// every later call fail mysteriously.)
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Fill 'size' floats with pseudo-random values in [0, 255/66.6].
// Fix: the original declared 'float size' but used it as an int loop bound.
void initialInt(float *ip, int size)
{
    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xff) / 66.6f;  // 'f' suffix: keep math in float
    }
}

// Print "a + b = c" for every element of the row-major nx-by-ny matrices.
// Debug helper only -- enormous output for large matrices.
void printMatrix(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A, *ib = B, *ic = C;
    printf("\nMatrix:(%d, %d)\n", nx, ny);
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            printf("%f + %f = %f ", ia[ix], ib[ix], ic[ix]);
        }
        ia += nx;
        ib += nx;
        ic += nx;
        printf("\n");
    }
    printf("\n");
}

// Print the element-wise difference between the CPU result C and the GPU
// result CC (all zeros means the two implementations agree exactly).
void printResult(float *C, float *CC, const int nx, const int ny)
{
    float *ic = C, *icc = CC;
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            printf("%f ", ic[ix] - icc[ix]);
        }
        ic += nx;
        icc += nx;
        printf("\n");
    }
    printf("\n");
}

// CPU reference: C = A + B over row-major nx-by-ny matrices.
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A, *ib = B, *ic = C;
    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            ic[ix] = ia[ix] + ib[ix];
        }
        ia += nx;
        ib += nx;
        ic += nx;
    }
}

// GPU kernel: C = A + B, one thread per matrix element.
// Expects a 2D grid of 2D blocks covering at least nx-by-ny threads.
__global__ void sumMatrixOnDevice(float *MatA, float *MatB, float *MatC,
                                  const int nx, const int ny)
{
    int ix = threadIdx.x + blockDim.x * blockIdx.x;  // matrix column
    int iy = threadIdx.y + blockDim.y * blockIdx.y;  // matrix row
    unsigned int idx = iy * nx + ix;                 // linear row-major offset

    // Bounds guard: the grid rarely divides the matrix exactly, so threads
    // past the edge must not touch memory.
    if (ix < nx && iy < ny) {
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}

int main(int argc, char **argv)
{
    // Report which device we run on.
    int dev = 0;
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n\n", dev, deviceProp.name);

    // Matrix dimensions: 16384 x 16384 -> 1 GiB per matrix.
    // size_t arithmetic: the original computed nBytes in int, one doubling
    // away from signed overflow.
    int nx = 1 << 14;
    int ny = 1 << 14;
    size_t nxy = (size_t)nx * ny;
    size_t nBytes = nxy * sizeof(float);

    // Host buffers: inputs A/B, CPU result C, GPU result CC.
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);
    float *h_CC = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || h_C == NULL || h_CC == NULL) {
        fprintf(stderr, "host malloc of %zu bytes failed\n", nBytes);
        return EXIT_FAILURE;
    }

    initialInt(h_A, (int)nxy);
    initialInt(h_B, (int)nxy);

    // CPU reference, timed with clock() (process CPU time is fine here
    // because the work is synchronous on the host).
    clock_t cpuStart = clock();
    sumMatrixOnHost(h_A, h_B, h_C, nx, ny);
    clock_t cpuEnd = clock();
    printf("cpu time:%f\n", (float)(cpuEnd - cpuStart) / CLOCKS_PER_SEC);
    //printMatrix(h_A, h_B, h_C, nx, ny);

    // Device buffers.
    float *d_MatA, *d_MatB, *d_MatC;
    CUDA_CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CUDA_CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CUDA_CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // GPU timing with CUDA events: clock() measures host CPU time and cannot
    // see asynchronous device work, so the original's numbers were unreliable.
    cudaEvent_t evStart, evEnd;
    CUDA_CHECK(cudaEventCreate(&evStart));
    CUDA_CHECK(cudaEventCreate(&evEnd));
    CUDA_CHECK(cudaEventRecord(evStart));

    // Transfer inputs host -> device (included in the timed region, matching
    // the original measurement of copy + compute + copy).
    CUDA_CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    // Launch configuration: 32x32 threads per block, ceil-div grid so every
    // element is covered (the kernel's bounds guard handles the overhang).
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    sumMatrixOnDevice<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    CUDA_CHECK(cudaGetLastError());  // launches return no status; catch bad config

    // Blocking copy device -> host also synchronizes with the kernel.
    CUDA_CHECK(cudaMemcpy(h_CC, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(evEnd));
    CUDA_CHECK(cudaEventSynchronize(evEnd));
    float gpuMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&gpuMs, evStart, evEnd));
    printf("gpu time:%f\n", gpuMs / 1000.0f);  // seconds, to match the CPU print
    //printMatrix(h_A, h_B, h_CC, nx, ny);
    //printResult(h_C, h_CC, nx, ny);

    // Cleanup. Fix: the original never freed h_CC (leak).
    CUDA_CHECK(cudaEventDestroy(evStart));
    CUDA_CHECK(cudaEventDestroy(evEnd));
    CUDA_CHECK(cudaFree(d_MatA));
    CUDA_CHECK(cudaFree(d_MatB));
    CUDA_CHECK(cudaFree(d_MatC));
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_CC);

    CUDA_CHECK(cudaDeviceReset());
    return 0;
}
运行结果如下: