CUDA实现矩阵加法

前言

本文通过一个矩阵加法的例子来说明如何使用网格和块来组织线程。

使用块和线程创建矩阵索引

一般情况下,一个矩阵用行优先的方法在全局内存中进行线性存储。如下图所示,这是一个8*6的矩阵。
8*6矩阵
在一个矩阵加法核函数中,一个线程通常被分配一个数据元素来处理。首先要使用块和线程索引从全局内存中访问指定的数据。接下来需要管理3种索引:

  1. 线程和块索引;
  2. 矩阵中给定点的坐标;
  3. 全局线性内存中的偏移量;

对于一个给定的线程,首先可以通过把线程和块索引映射到矩阵坐标上来获取线程块和线程索引的全局内存偏移量,然后将这些矩阵坐标映射到全局内存的存储单元中。
第一步,可以用如下公式把线程和块索引映射到矩阵坐标上:

ix = threadIdx.x + blockDim.x*blockIdx.x;
iy = threadIdx.y + blockDim.y*blockIdx.y;

第二步,可以用如下公式把矩阵坐标映射到全局内存中的索引/存储单元上:

idx = iy * nx + ix;

如下图说明了块和线程索引、矩阵坐标以及线性全局内存索引之间的对应关系。
全局线性内存索引:idx = iy*nx + ix

使用二维网格和二维块对矩阵求和

使用一个二维网格和二维块来编写一个矩阵加法核函数,代码如下:

#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Fill `ip` with pseudo-random floats in [0, 255/66.6].
// BUG FIX: `size` was declared `float`, which silently loses precision for
// element counts above 2^24; it is only ever used as an integer loop bound,
// so take it as `int`. Callers already pass an int, so call sites are
// unaffected. (The name "initialInt" is historical; renaming would break
// callers, so it is kept.)
void initialInt(float *ip, int size)
{
	for (int i = 0; i < size; i++)
	{
		// rand() & 0xff keeps the low byte (0..255); /66.6 scales it down.
		ip[i] = (float)(rand() & 0xff) / 66.6;
	}
}
// Print the element-wise sums "A + B = C" of a row-major nx-by-ny matrix,
// one matrix row per output line, for visual inspection.
void printMatrix(float *A, float *B, float *C, const int nx, const int ny)
{
	printf("\nMatrix:(%d, %d)\n", nx, ny);
	for (int row = 0; row < ny; row++)
	{
		for (int col = 0; col < nx; col++)
		{
			// Flat row-major offset of element (row, col).
			int idx = row * nx + col;
			printf("%f + %f = %f     ", A[idx], B[idx], C[idx]);
		}
		printf("\n");
	}
	printf("\n");
}
// Print the element-wise difference C - CC (CPU result vs. GPU result) so
// that mismatches stand out as non-zero entries, one matrix row per line.
void printResult(float *C, float *CC, const int nx, const int ny)
{
	for (int row = 0; row < ny; row++)
	{
		for (int col = 0; col < nx; col++)
		{
			// Flat row-major offset of element (row, col).
			int idx = row * nx + col;
			printf("%f     ", C[idx] - CC[idx]);
		}
		printf("\n");
	}
	printf("\n");
}
// CPU reference implementation of C = A + B for a row-major nx-by-ny
// matrix. Elements are independent, so the 2D matrix can be treated as
// one flat array of nx*ny floats.
void sumMatrixOnHost(float *A, float*B, float*C, const int nx, const int ny)
{
	const int total = nx * ny;
	for (int idx = 0; idx < total; idx++)
	{
		C[idx] = A[idx] + B[idx];
	}
}
// GPU kernel: MatC = MatA + MatB for a row-major nx-by-ny matrix.
// Expects a 2D grid of 2D blocks covering at least nx x ny threads;
// the bounds guard discards the surplus threads at the grid tail.
__global__ void sumMatrixOnDevice(float *MatA, float *MatB, float *MatC, const int nx, const int ny)
{
	// Map (block, thread) indices to the matrix coordinate of this thread.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	if (col < nx && row < ny)
	{
		// Flat row-major offset into the linear global-memory layout.
		const unsigned int idx = row * nx + col;
		MatC[idx] = MatA[idx] + MatB[idx];
	}
}


// Abort with a readable message when a CUDA API call fails. The original
// code ignored every return status, so a failed cudaMalloc (each matrix is
// 1 GiB) or a bad kernel launch went unnoticed.
#define CHECK(call)                                                          \
	do {                                                                     \
		cudaError_t err_ = (call);                                           \
		if (err_ != cudaSuccess) {                                           \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
			        cudaGetErrorString(err_));                               \
			exit(EXIT_FAILURE);                                              \
		}                                                                    \
	} while (0)

// Sum two 2^14 x 2^14 matrices on the CPU and the GPU, timing both paths.
int main(int argc, char **argv)
{
	// Report which device we are running on.
	int dev = 0;
	cudaDeviceProp deviceProp;
	CHECK(cudaGetDeviceProperties(&deviceProp, dev));
	printf("Using Device  %d: %s\n\n", dev, deviceProp.name);

	// Matrix dimensions: 2^28 floats => 1 GiB per matrix.
	int nx = 1 << 14;
	int ny = 1 << 14;
	int nxy = nx * ny;
	// size_t instead of int: robust if the dimensions are ever enlarged.
	size_t nBytes = (size_t)nxy * sizeof(float);

	// Host buffers: inputs A/B, CPU result h_C, GPU result h_CC.
	float *h_A = (float *)malloc(nBytes);
	float *h_B = (float *)malloc(nBytes);
	float *h_C = (float *)malloc(nBytes);
	float *h_CC = (float *)malloc(nBytes);
	if (h_A == NULL || h_B == NULL || h_C == NULL || h_CC == NULL)
	{
		fprintf(stderr, "host malloc failed\n");
		exit(EXIT_FAILURE);
	}

	// Initialize the inputs with pseudo-random values.
	initialInt(h_A, nxy);
	initialInt(h_B, nxy);

	// Time the CPU reference path.
	clock_t cpuStart = clock();
	sumMatrixOnHost(h_A, h_B, h_C, nx, ny);
	clock_t cpuEnd = clock();
	float cpuTime = (float)(cpuEnd - cpuStart) / CLOCKS_PER_SEC;
	printf("cpu time:%f\n", cpuTime);

	// Device buffers.
	float *d_MatA, *d_MatB, *d_MatC;
	CHECK(cudaMalloc((void **)&d_MatA, nBytes));
	CHECK(cudaMalloc((void **)&d_MatB, nBytes));
	CHECK(cudaMalloc((void **)&d_MatC, nBytes));

	// GPU timing deliberately includes the H2D/D2H transfers so the figure
	// is end-to-end, comparable with the CPU loop above.
	clock_t gpuStart = clock();

	CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

	// 32x32 = 1024 threads per block; ceil-div grid covers the whole matrix.
	int dimx = 32;
	int dimy = 32;
	dim3 block(dimx, dimy);
	dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

	sumMatrixOnDevice<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
	CHECK(cudaGetLastError());       // catch launch-configuration errors
	CHECK(cudaDeviceSynchronize());  // catch asynchronous execution errors

	CHECK(cudaMemcpy(h_CC, d_MatC, nBytes, cudaMemcpyDeviceToHost));

	clock_t gpuEnd = clock();
	float gpuTime = (float)(gpuEnd - gpuStart) / CLOCKS_PER_SEC;
	printf("gpu time:%f\n", gpuTime);

	// Release device and host memory (the original leaked h_CC).
	CHECK(cudaFree(d_MatA));
	CHECK(cudaFree(d_MatB));
	CHECK(cudaFree(d_MatC));
	free(h_A);
	free(h_B);
	free(h_C);
	free(h_CC);

	// Flush profiling data and tear the device context down cleanly.
	CHECK(cudaDeviceReset());

	return 0;
}

运行结果如下:
运行结果