CUDA Programming

1.3 The CUDA Programming Model: Assigning Parallel Threads

2020-07-07  Catherin_gao

1. How to index a matrix with block and thread indices

1.1 A matrix is stored linearly in global memory in row-major order.

Storage layout of an 8*6 matrix (figure)
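To make the row-major layout concrete, here is a minimal host-side sketch; the 8-column by 6-row size and the printed element are illustrative assumptions, not taken from the original figure. It shows how element (row, col) maps to a single linear offset:

#include <stdio.h>

int main(void) {
    const int nx = 8, ny = 6;        // assumed: 8 columns, 6 rows
    float mat[6 * 8];                // flat row-major storage

    for (int row = 0; row < ny; row++)
        for (int col = 0; col < nx; col++)
            mat[row * nx + col] = (float)(row * nx + col);  // store its own linear index

    // Element (row=2, col=3) lives at offset 2 * 8 + 3 = 19
    printf("mat(2,3) -> flat index %d, value %.0f\n", 2 * nx + 3, mat[2 * nx + 3]);
    return 0;
}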

1.2 The first problem to solve in GPU programming is how to use the block and thread indices to access, from global memory, the portion of data assigned to each thread.

1.2.1 Mapping method (2D grid and 2D blocks)

For a given thread, the offset into global memory is obtained from its block and thread indices in two steps:

(1) Map the thread and block indices to coordinates in the matrix:

ix = threadIdx.x + blockIdx.x * blockDim.x 
iy = threadIdx.y + blockIdx.y * blockDim.y

(2) Map these matrix coordinates to a linear location in global memory:

idx = iy * nx + ix
Mapping diagram (figure)

(3) Matrix addition example

__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny) {
    // Map thread and block indices to matrix coordinates (ix, iy)
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    // Map the matrix coordinates to a linear offset in global memory
    unsigned int idx = iy * nx + ix;

    // Guard against threads that fall outside the matrix bounds
    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}
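The post shows only the kernel for this 2D case; a possible host-side launch configuration could look like the sketch below, where the 32x32 block size and the device pointers d_MatA, d_MatB, d_MatC are assumptions for illustration:

// Sketch: launch sumMatrixOnGPU2D with a 2D grid of 2D blocks
dim3 block(32, 32);
dim3 grid((nx + block.x - 1) / block.x,   // enough blocks to cover all nx columns
          (ny + block.y - 1) / block.y);  // enough blocks to cover all ny rows

sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
cudaDeviceSynchronize();                  // wait for the kernel to finish

The boundary check inside the kernel is what makes the rounded-up grid size safe when nx or ny is not a multiple of the block size.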

2. To learn how different launch configurations map to the same problem, implement the same (nx*ny) matrix addition with other grid and block layouts.

2.1 1D Grid and 1D Blocks

// 1D grid of 1D blocks: one thread per matrix column
dim3 block(32, 1);
dim3 grid((nx + block.x - 1) / block.x, 1);

__global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx, int ny) {
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    if (ix < nx) {
        // Each thread loops over all ny rows of its column
        for (int iy = 0; iy < ny; iy++) {
            int idx = iy * nx + ix;
            MatC[idx] = MatA[idx] + MatB[idx];
        }
    }
}
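A launch sketch for this configuration, assuming device pointers d_MatA, d_MatB, d_MatC have already been allocated and filled:

// Sketch: each thread of the 1D grid sums one full column
sumMatrixOnGPU1D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
cudaDeviceSynchronize();

Compared with the 2D layout, far fewer threads are launched (one per column instead of one per element), and each thread performs ny additions in a serial loop.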

2.2 2D Grid and 1D Blocks

dim3 block(32);
dim3 grid((nx + block.x - 1) / block.x, ny);

ix = threadIdx.x + blockIdx.x * blockDim.x;
iy = blockIdx.y;

Block index assignment diagram (figure)

__global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx, int ny) {
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    // With 1D blocks in a 2D grid, blockIdx.y is the row index directly
    unsigned int iy = blockIdx.y;
    unsigned int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
        MatC[idx] = MatA[idx] + MatB[idx];
}
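For completeness, here is a rough end-to-end host driver for the mixed configuration. The matrix size, the omitted initialization, and the lack of error checking are simplifying assumptions, not part of the original post:

#include <cuda_runtime.h>

int main(void) {
    int nx = 1 << 10, ny = 1 << 10;               // assumed matrix size
    size_t nBytes = nx * ny * sizeof(float);

    float *d_MatA, *d_MatB, *d_MatC;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMalloc((void **)&d_MatB, nBytes);
    cudaMalloc((void **)&d_MatC, nBytes);
    // ... copy input matrices into d_MatA and d_MatB with cudaMemcpy ...

    dim3 block(32);                               // 1D block: 32 threads along x
    dim3 grid((nx + block.x - 1) / block.x, ny);  // 2D grid: one grid row per matrix row
    sumMatrixOnGPUMix<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    cudaDeviceSynchronize();

    cudaFree(d_MatA);
    cudaFree(d_MatB);
    cudaFree(d_MatC);
    return 0;
}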
