CUDA C初学者编程 (VS2017)

2018-10-24  本文已影响0人  Eden0503

打开VS2017后,文件——新建——项目


找到NVIDIA,有的人说自己的VS中没看见NVIDIA这一项啊,那是因为你没有安装CUDA,或者你在安装CUDA的时候参照某教程将Visual Studio Integration 取消勾选安装,其实后来重新装上就行。

创建一个文件夹名为 cuda_test 的项目,然后我们发现其实里面已经有 .cu 文件了,如下图所示。

然后,我们像C语言一样生成编译文件,最终结果如下:

接下来,我们修改代码如下,并运行以下代码。

#include <stdio.h>

const int N = 16;
const int blocksize = 16;

// Kernel: adds a per-element integer offset to each character.
// Launched with one thread per element; thread i handles a[i] += b[i].
__global__  void hello(char *a, int *b)
{
    const int idx = threadIdx.x;
    a[idx] += b[idx];
}

// Classic "Hello World" CUDA demo: copies "Hello " plus per-character
// offsets to the device, lets one block of threads add them, and prints
// the transformed string ("World!").
int main()
{
    // Host inputs: the base string and the offsets that rewrite it.
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = { 15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    const int csize = N * sizeof(char);
    const int isize = N * sizeof(int);

    printf("%s", a);

    // Device buffers and host-to-device transfers.
    char *ad;
    int *bd;
    cudaMalloc((void**)&ad, csize);
    cudaMalloc((void**)&bd, isize);
    cudaMemcpy(ad, a, csize, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, b, isize, cudaMemcpyHostToDevice);

    // One block of `blocksize` threads; each thread transforms one character.
    dim3 dimBlock(blocksize, 1);
    dim3 dimGrid(1, 1);
    hello << <dimGrid, dimBlock >> > (ad, bd);

    // cudaMemcpy synchronizes with the kernel, so the result is ready here.
    cudaMemcpy(a, ad, csize, cudaMemcpyDeviceToHost);
    cudaFree(ad);
    cudaFree(bd);

    printf("%s\n", a);
    return EXIT_SUCCESS;
}

编译运行成功,最终结果如下:


运行以下代码,查看电脑设备。

/*********************************************/

#include <stdio.h>
#include <cuda_runtime.h>

// Prints the main fields of a cudaDeviceProp.
// BUG FIX: totalGlobalMem, sharedMemPerBlock, memPitch, totalConstMem and
// textureAlignment are size_t; printing them with "%d" is undefined behavior
// and misreports values on 64-bit builds (e.g. GPUs with > 2 GB memory).
// They are now printed with "%zu".
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);   // GPU clock frequency (kHz)
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// CUDA initialization: find the first device with compute capability >= 1.x,
// print its properties, and make it the current device.
// Returns false when no usable device exists.
bool InitCUDA()
{
    // BUG FIX: initialize `count` and check the API status. On failure,
    // cudaGetDeviceCount leaves its output untouched, so the original code
    // compared an uninitialized value against 0.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    // Find the first device with major compute capability >= 1.
    int i;
    for (i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printDeviceProp(prop);
                break;
            }
        }
    }

    // No qualifying device was found.
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    // Make the chosen device current for subsequent CUDA calls.
    cudaSetDevice(i);

    return true;
}

// Entry point: report whether a usable CUDA device was found and selected.
int main(int argc, char const *argv[])
{
    const bool ok = InitCUDA();
    if (ok) {
        printf("CUDA initialized.\n");
    }
    return 0;
}

程序运行结果:


查看当前系统上安装的GPU设备的详细参数

打开文件夹C:\ProgramData\NVIDIA Corporation\CUDA Samples\v9.2\1_Utilities
其中有一个名为deviceQuery的程序,可编译执行,即可打出当前系统上安装的GPU设备的详细参数。结果如下:

CUDA 初始化

/*
CUDA初始化
*/

#include "cuda_runtime.h"
#include <stdio.h>

//CUDA 初始化
// CUDA initialization: select the first device with compute capability >= 1.x.
// Returns false when no usable device exists.
bool InitCUDA(){
    // BUG FIX: initialize `count` and check the return status; on failure
    // cudaGetDeviceCount leaves `count` untouched, so the original code read
    // an uninitialized value.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    // Find the first device with major compute capability >= 1.
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}

// Entry point: initialize CUDA and report success.
int main(){
    // Print only when a CUDA-capable device was successfully selected.
    if (InitCUDA()) {
        printf("CUDA initialized.\n");
    }
    return 0;
}

运行得到结果:

说明系统上有支持CUDA的装置。

完整的向量点积CUDA程序

/*
完整的向量点积CUDA程序
a=[a1,a2,…an], b=[b1,b2,…bn]
a*b=a1*b1+a2*b2+…+an*bn
*/
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cuda.h> 
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#define N 10
// Kernel: dot product of two N-element vectors, launched as one block of N
// threads. Each thread computes one pairwise product into shared memory;
// thread 0 then reduces the products and writes the scalar result to *c.
__global__ void Dot(int *a, int *b, int *c) // kernel declaration
{
    __shared__ int temp[N]; // per-block scratch array in shared memory
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();  // all products must be written before thread 0 reduces
    if (0 == threadIdx.x)
    {
        int sum = 0;
        for (int i = 0; i < N; i++)
            sum += temp[i];
        *c = sum;
        printf("sum Calculated on Device:%d\n", *c);
    }
}
// Fills a[0..n-1] with pseudo-random digits in [0, 9].
void random_ints(int *a, int n)
{
    for (int idx = 0; idx < n; idx++) {
        a[idx] = rand() % 10;
    }
}
// Dot-product driver: builds two random N-element vectors, computes a*b on
// the GPU (single block, N threads) and on the CPU, and prints both results.
int main()
{
    // Host and device buffers for the two input vectors and the scalar result.
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    const int size = N * sizeof(int);

    // Device allocations.
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, sizeof(int));

    // Host allocations, filled with random digits 0-9.
    a = (int *)malloc(size);
    random_ints(a, N);
    b = (int *)malloc(size);
    random_ints(b, N);
    c = (int *)malloc(sizeof(int));

    printf("Array a[N]:\n");
    for (int i = 0; i < N; i++) {
        printf("%d ", a[i]);
    }
    printf("\n");
    printf("Array b[N]:\n");
    for (int i = 0; i < N; i++) {
        printf("%d ", b[i]);
    }
    printf("\n\n");

    // Copy inputs to the device, run the kernel, copy the result back.
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    Dot << <1, N >> > (d_a, d_b, d_c); // one block, N threads
    cudaMemcpy(c, d_c, sizeof(int), cudaMemcpyDeviceToHost);

    // CPU reference dot product for comparison.
    int sumHost = 0;
    for (int i = 0; i < N; i++) {
        sumHost += a[i] * b[i];
    }
    printf("sum Calculated on Host=%d\n", sumHost);
    printf("Device to Host: a*b=%d\n", *c);

    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
运行结果如下:

计算随机数向量立方和的CUDA程序

/*
计算随机数向量立方和的CUDA程序
*/

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <cuda.h> 
#include <stdio.h>
#include <stdlib.h>  // 为了使用rand 函数
#include <malloc.h>
#define DATA_SIZE 1048576 //数据
int data[DATA_SIZE];
//产生大量0-9之间的随机数
// Fills number[0..size-1] with pseudo-random digits in [0, 9].
// (rand() % 10 restricts each value to a single decimal digit.)
void GenerateNumbers(int *number, int size)
{
    for (int k = 0; k < size; k++)
        number[k] = rand() % 10;
}

//CUDA 初始化
// CUDA initialization: select the first device with compute capability >= 1.x.
// Returns false when no usable device exists.
bool InitCUDA()
{
    // BUG FIX: initialize `count` and check the return status; on failure
    // cudaGetDeviceCount leaves `count` untouched, so the original code read
    // an uninitialized value.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    // Find the first device with major compute capability >= 1.
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和
// __global__ kernel (runs on the GPU with a single thread): sums the cubes
// of all DATA_SIZE elements of num and stores the scalar total in *result.
__global__ static void sumOfcubes(int *num, int* result)
{
    int total = 0;
    for (int k = 0; k < DATA_SIZE; k++) {
        const int v = num[k];
        total += v * v * v;
    }
    *result = total;
}
// Driver: generates DATA_SIZE random digits, sums their cubes on the GPU
// (single thread) and on the CPU, and prints both totals for comparison.
int main()
{
    // Initialize CUDA; bail out if no capable device exists.
    if (!InitCUDA()) {
        return 0;
    }

    // Generate the random input data on the host.
    GenerateNumbers(data, DATA_SIZE);

    int* gpudata, *result;  // device buffers
    int gpusum = 0;         // host copy of the GPU result

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));

    // Copy the input to the device, run the kernel, copy the result back.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
    sumOfcubes <<<1,1,0>>> (gpudata, result);
    cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(gpudata);
    cudaFree(result);
    printf("GPUsum: %d \n", gpusum);

    // CPU reference computation for verification.
    int sum = 0;
    for (int k = 0; k < DATA_SIZE; k++) {
        sum += data[k] * data[k] * data[k];
    }
    printf("CPUsum: %d \n", sum);
    return 0;
}

运行结果如下:

计算随机数向量立方和的CUDA程序(加入统计时间的函数)

/*
计算随机数向量立方和的CUDA程序
*/

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <time.h>  // 为了使用clock_t
#include <cuda.h> 
#include <stdio.h>
#include <stdlib.h>  // 为了使用rand 函数
#include <malloc.h>
#define DATA_SIZE 1048576 //数据
int data[DATA_SIZE];
//产生大量0-9之间的随机数
// Fills number[0..size-1] with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size)
{
    for (int idx = 0; idx < size; idx++)
        number[idx] = rand() % 10;
}

//CUDA 初始化
// CUDA initialization: select the first device with compute capability >= 1.x.
// Returns false when no usable device exists.
bool InitCUDA()
{
    // BUG FIX: initialize `count` and check the return status; on failure
    // cudaGetDeviceCount leaves `count` untouched, so the original code read
    // an uninitialized value.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    // Find the first device with major compute capability >= 1.
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和

// __global__ kernel (runs on the GPU with a single thread): sums the cubes
// of all DATA_SIZE elements and records the elapsed device clock ticks.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time)
{
    int sum = 0;
    int i;
    clock_t start = clock();  // device-side clock(): raw GPU ticks, not seconds
    for (i = 0; i < DATA_SIZE; i++) {
        sum += num[i] * num[i] * num[i];
    }
    *result = sum;
    *time = clock() - start;  // elapsed ticks; divide by GPU clock rate for time
}

// Driver: sums the cubes of DATA_SIZE random digits on the GPU (single
// thread, timed with the device clock) and on the CPU, printing both.
int main()
{
    // Initialize CUDA; bail out if no capable device exists.
    if (!InitCUDA()) {
        return 0;
    }
    // Generate the random input data on the host.
    GenerateNumbers(data, DATA_SIZE);
    int* gpudata, *result;  // device buffers
    int gpusum = 0;         // host copy of the GPU result

    clock_t* time;          // device buffer for the tick count
    clock_t time_used;      // host copy of the tick count

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));
    cudaMalloc((void**)&time, sizeof(clock_t));

    // Copy the input to the device, run the single-thread kernel, copy back.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
    sumOfcubes <<<1,1,0>>> (gpudata, result, time);
    cudaMemcpy(&gpusum, result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);

    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    // BUG FIX: clock_t is not int, so printing it with "%d" was undefined
    // behavior; cast to long and print with "%ld".
    printf("GPUsum: %d——time:%ld\n", gpusum, (long)time_used);

    // CPU reference computation for verification.
    int sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", sum);
    return 0;
}
结果如下:

GTX 1050上运行核函数部分的时间约为:
1992949251/ 1493000 = 1334.8 mS 其中1493000是GPU频率。
重要!!!
也就是说,clock() 统计出来的 (end - start) / GPU频率 = 毫秒级单位的时间。其中利用 cudaGetDeviceProperties 的 clockRate 字段获取的GPU频率单位是千赫兹,比如我的GTX1050 的频率是 1493000千赫兹。

进一步优化上述求立方和的程序(利用单block多thread)

/*
计算随机数向量立方和的CUDA程序
*/

#include "cuda_runtime.h"
#include <time.h>  // 为了使用clock_t
#include <stdio.h>
#include <stdlib.h>  // 为了使用rand 函数

//#include "device_launch_parameters.h"
//#include "device_functions.h"
//#include <cuda.h> 
//#include <malloc.h>

#define DATA_SIZE 1048576 //数据
#define THREAD_NUM 256  //线程的数量
int data[DATA_SIZE];

//产生大量0-9之间的随机数
// Fills number[0..size-1] with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size){
    for (int k = 0; k < size; k++)
        number[k] = rand() % 10;
}

//CUDA 初始化
// CUDA initialization: select the first device with compute capability >= 1.x.
// Returns false when no usable device exists.
bool InitCUDA(){
    // BUG FIX: initialize `count` and check the return status; on failure
    // cudaGetDeviceCount leaves `count` untouched, so the original code read
    // an uninitialized value.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    // Find the first device with major compute capability >= 1.
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和

// __global__ kernel: THREAD_NUM threads in one block; each thread sums the
// cubes of its own contiguous chunk of DATA_SIZE/THREAD_NUM elements and
// writes its partial sum to result[tid]. Thread 0 times the whole kernel.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time){
    const int tid = threadIdx.x;
    const int size = DATA_SIZE / THREAD_NUM;// elements handled by each thread
    int sum = 0;
    int i;

    clock_t start; // start tick, meaningful only on thread 0

    if (tid == 0) start = clock(); // only thread 0 (threadIdx.x == 0) records time
    for (i = tid * size; i < (tid + 1) * size; i++){
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    // Timing is finished only on thread 0 (threadIdx.x == 0).
    if (tid == 0) *time = clock() - start;
}

// Driver: sums the cubes of DATA_SIZE random digits on the GPU with one block
// of THREAD_NUM threads, reduces the partial sums on the CPU, and compares
// against a CPU reference.
int main(){ 
    if (!InitCUDA()) {return 0;}// CUDA initialization
    GenerateNumbers(data, DATA_SIZE); // generate random input
    int* gpudata, *result;   // device buffers
    int gpusum[THREAD_NUM];  // host copy of the per-thread partial sums
    // BUG FIX: the original declared two clock_t variables (`time_used` and
    // `time_use`) but only used one; the unused duplicate is removed.
    clock_t* time;           // device buffer for the tick count
    clock_t time_use;        // host copy of the tick count

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&time, sizeof(clock_t));
    cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM);

    // Copy the random input data from host memory to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice); 
    // Launch the kernel: 1 block, THREAD_NUM threads, no dynamic shared memory.
    sumOfcubes<<<1,THREAD_NUM,0>>> (gpudata, result,time);

    // Copy the partial sums and the tick count back to the host.
    cudaMemcpy(&gpusum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_use, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    // Reduce the per-thread partial sums on the CPU.
    int final_sum = 0;
    for (int i = 0; i < THREAD_NUM; i++){
        final_sum += gpusum[i];
    }
    // BUG FIX: clock_t is not int, so "%d" was undefined behavior; cast to
    // long and print with "%ld".
    printf("GPUsum: %d gputime: %ld\n", final_sum, (long)time_use);

    // CPU reference computation for verification.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    return 0;
}

结果如下:

然而,上述程序还能够优化,上述代码:

    if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
    for (i = tid * size; i < (tid + 1) * size; i++){
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    //计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
    if (tid == 0) *time = clock() - start;
}

中核函数并不是连续存取的,读取数字完全是跳跃式的读取,这会非常影响内存的存取效率。因此我们下一步要将取数字的过程变成:thread 0 读取第一个数字,thread 1 读取第二个数字,以此类推......
程序部分修改为:

    if (tid == 0) start = clock(); //只在thread 0(即threadIdx.x = 0 的时候)进行记录
    for (i = tid; i < DATA_SIZE; i+= THREAD_NUM){
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    //计算时间的动作,只在thread 0(即threadIdx.x = 0 的时候)进行
    if (tid == 0) *time = clock() - start;
实验运行结果:

???问题来了,老师上课的时候明明说,这样的优化会使得程序运行时间缩短很多倍,但是为啥我的运行时间结果没啥大的改变呢。8746726/8365217=1.04,几乎就是没有改进嘛.....难道是程序出现了问题,于是,我就将两个程序放在服务器上运行了。结果显示 1103530/185263=5.96,有近6倍的提升,说明这样的优化程序是正确的,只是在VS2017的编译运行中没有体现出来,可这又是为啥?



经过一番查询资料,发现VS2017中我在编译运行的时候用的是debug模式,其实需要改成Release模式。
debug.png Release.png

修改完编译器中的模式之后,发现结果果然不一样了,1060761/166149 = 6.38,说明优化后的程序有很大的提升,因此,得出一个结果,VS中的debug模式只能用来修改程序中的错误,但是不能说明真正的程序性能的好坏,谨记,要改为Release模式。


优化前.png 优化后.png

更进一步的优化,多block多thread

CUDA不仅提供了Thread,还提供了Grid和Block以及Share Memory这些非常重要的机制,每个block中Thread极限是1024,但是通过block和Grid,线程的数量还能成倍增长,甚至用几万个线程。
/*
计算随机数向量立方和的CUDA程序
*/
#include "cuda_runtime.h"
#include <time.h>  // 为了使用clock_t
#include <stdio.h>
#include <stdlib.h>  // 为了使用rand 函数

//#include "device_launch_parameters.h"
//#include "device_functions.h"
//#include <cuda.h> 
//#include <malloc.h>

#define DATA_SIZE 1048576 //数据
#define THREAD_NUM 256 //thread 数量
#define BLOCK_NUM  32// block 数量 
//配置32 个blocks,每个blocks 有256个threads,总共有32*256= 8192个threads。
int data[DATA_SIZE];

//产生大量0-9之间的随机数
// Fills number[0..size-1] with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size) {
    for (int idx = 0; idx < size; idx++)
        number[idx] = rand() % 10;
}

//CUDA 初始化
// CUDA initialization: select the first device with compute capability >= 1.x.
// Returns false when no usable device exists.
bool InitCUDA() {
    // BUG FIX: initialize `count` and check the return status; on failure
    // cudaGetDeviceCount leaves `count` untouched, so the original code read
    // an uninitialized value.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    // Find the first device with major compute capability >= 1.
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和

// __global__ kernel: BLOCK_NUM blocks x THREAD_NUM threads. Each thread sums
// cubes over a strided slice so consecutive threads read consecutive elements
// (coalesced access). Thread 0 of each block records start/end clock ticks.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int sum = 0;
    int i;

    // Only thread 0 records timing; each block logs its own start tick.
    if (tid == 0) time[bid] = clock();
    // The element index combines bid and tid; the grid-wide stride keeps the
    // per-warp accesses contiguous in memory.
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        sum += num[i] * num[i] * num[i];
    }
    // One result slot per thread in the whole grid.
    result[bid * THREAD_NUM + tid] = sum;
    // End ticks are stored in the second half of the time array.
    if (tid == 0) time[bid + BLOCK_NUM] = clock();
}

// Driver: multi-block, multi-thread cube sum. Launches BLOCK_NUM x THREAD_NUM
// threads, reduces the per-thread partial sums on the CPU, derives the GPU
// time from the earliest block start tick and the latest block end tick, and
// verifies the total against a CPU reference.
int main() {
    if (!InitCUDA()) { return 0; }// CUDA initialization
    GenerateNumbers(data, DATA_SIZE); // generate random input
    int *gpudata, *result;  // device buffers
    clock_t *time;          // device buffer: BLOCK_NUM start + BLOCK_NUM end ticks

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
    cudaMalloc((void**)&result, sizeof(int)*THREAD_NUM*BLOCK_NUM);

    // Copy the input data from host memory to device memory.
    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch the kernel: BLOCK_NUM blocks of THREAD_NUM threads each.
    sumOfcubes << <BLOCK_NUM, THREAD_NUM, 0 >> > (gpudata, result, time); // device-side parameters

    int gpusum[THREAD_NUM*BLOCK_NUM];  // host copy of the per-thread partial sums
    clock_t time_use[BLOCK_NUM * 2];   // host copy of the per-block tick pairs

    // Copy the partial sums and timing data back to host memory.
    cudaMemcpy(&gpusum, result, sizeof(int) * THREAD_NUM * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);

    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    int final_sum = 0; /* reduce the per-thread cube sums */
    for (int i = 0; i < THREAD_NUM * BLOCK_NUM; i++) {
        final_sum += gpusum[i];
    }

    // Timing strategy: subtract the earliest block start tick from the latest
    // block end tick to get the total kernel running time.
    clock_t min_start, max_end;
    min_start = time_use[0];
    max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {  // scan all blocks for the extremes
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM])
            max_end = time_use[i + BLOCK_NUM];
    }
    printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);


    // CPU reference computation for verification.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    return 0;
}
会出现以下错误: error.png

解决方案:release模式会优化很多东西,但是多重新编译几次,又会成功,真是搞笑,搞不懂这些编译器......... 算了,还是建议去服务器上运行吧,VS中实在有太多无法预知的错误了。

程序运行结果: Release_threads256_blocks32.png

后来修改 THREAD_NUM = 1024 , BLOCK_NUM = 128 ,总共 1024 * 128 = 131072 个threads,性能并没有很大提升了。

 #define THREAD_NUM 1024  //thread 数量
 #define BLOCK_NUM 128 // block 数量 
Release_threads1024_blocks128.png

为什么不用极限的1024个线程呢?

  • 如果1024个线程全部用上,那样就是32*1024 = 32768 个线程,难道不是更好吗?从线程运行的原理来看,线程数量达到一定大小后,再一味地增加线程也不会取得性能的提升,反而有可能会让性能下降。线程数达到隐藏各种latency的程度后,之后线程数量的提升就没有太大的作用了。
  • 程序中加和部分是在CPU上进行的,越多的线程意味着越多的结果,而这也意味着CPU上的运算压力会越来越大。

Thread 的同步

一个block内的所有的 thread 可以有共享的内存,也可以进行同步,因此,还可以让每个block 内所有 thread 将自己的计算结果相加起来。代码可以进一步修改:

#include "cuda_runtime.h"
#include <time.h>  // 为了使用clock_t
#include <stdio.h>
#include <stdlib.h>  // 为了使用rand 函数

//#include "device_launch_parameters.h"
//#include "device_functions.h"
//#include <cuda.h> 
//#include <malloc.h>

#define DATA_SIZE 1048576 //数据
#define THREAD_NUM 256 //thread 数量
#define BLOCK_NUM 32// block 数量 
//配置32 个blocks,每个blocks 有256个threads,总共有32*256= 8192个threads。
int data[DATA_SIZE];

//产生大量0-9之间的随机数
// Fills number[0..size-1] with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size) {
    for (int k = 0; k < size; k++)
        number[k] = rand() % 10;
}

//CUDA 初始化
// CUDA initialization: select the first device with compute capability >= 1.x.
// Returns false when no usable device exists.
bool InitCUDA() {
    // BUG FIX: initialize `count` and check the return status; on failure
    // cudaGetDeviceCount leaves `count` untouched, so the original code read
    // an uninitialized value.
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    // Find the first device with major compute capability >= 1.
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
// __global__ 函数(GPU上执行) 计算立方和

// __global__ kernel: each block reduces its threads' cube partial sums in
// dynamic shared memory (sizeof(int)*THREAD_NUM passed as the third launch
// parameter) and writes ONE result per block. Thread 0 of each block records
// start/end clock ticks.
__global__ static void sumOfcubes(int *num, int *result, clock_t *time) {
    extern __shared__ int shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    int i;

    // Only thread 0 records timing; each block logs its own start tick.
    if (tid == 0) time[bid] = clock(); 
    shared[tid] = 0;
    // Strided access: consecutive threads read consecutive elements.
    for (i = bid * THREAD_NUM + tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM) {
        shared[tid] += num[i] * num[i] * num[i];
    }

    // All partial sums must be written before thread 0 reduces them.
    __syncthreads();
    if (tid == 0) {
        for (i = 1; i < THREAD_NUM; i++) {
            shared[0] += shared[i];
        }
        // BUG FIX: the original wrote result[tid], which is always result[0]
        // here (tid == 0), so every block clobbered the same slot and
        // result[1..BLOCK_NUM-1] were never written — the host-side reduction
        // over BLOCK_NUM entries then summed uninitialized memory. Each block
        // must write its own slot, indexed by its block id.
        result[bid] = shared[0];
    }
    if (tid == 0) time[bid + BLOCK_NUM] = clock();
}

// Driver: per-block shared-memory reduction version. Each block returns one
// partial sum (result has BLOCK_NUM entries); the CPU reduces those and
// derives the GPU time from the per-block start/end ticks.
// NOTE(review): this main expects the kernel to write one entry per block
// into result[0..BLOCK_NUM-1] — verify the kernel indexes result by block id.
int main() {
    if (!InitCUDA()) { return 0; }// CUDA initialization
    GenerateNumbers(data, DATA_SIZE); // generate random input
    int *gpudata, *result;  // device buffers
    clock_t *time;          // device buffer: BLOCK_NUM start + BLOCK_NUM end ticks

    cudaMalloc((void**)&gpudata, sizeof(int)* DATA_SIZE);
    cudaMalloc((void**)&time, sizeof(clock_t)*BLOCK_NUM * 2);
    cudaMalloc((void**)&result, sizeof(int)*BLOCK_NUM);

    cudaMemcpy(gpudata, data, sizeof(int)* DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch the kernel; the third parameter allocates the dynamic shared
    // memory (one int per thread) used for the in-block reduction.
    sumOfcubes << <BLOCK_NUM, THREAD_NUM, sizeof(int)* THREAD_NUM >> > (gpudata, result, time); // device-side parameters

    int gpusum[BLOCK_NUM];  // host copy of the per-block partial sums
    clock_t time_use[BLOCK_NUM * 2];

    cudaMemcpy(&gpusum, result, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_use, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);

    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);

    int final_sum = 0; /* reduce the per-block cube sums */
    for (int i = 0; i <BLOCK_NUM; i++) {
        final_sum += gpusum[i];
    }

    // Timing strategy: subtract the earliest block start tick from the latest
    // block end tick to get the total kernel running time.
    clock_t min_start, max_end;
    min_start = time_use[0];
    max_end = time_use[BLOCK_NUM];
    for (int i = 1; i < BLOCK_NUM; i++) {  // scan all blocks for the extremes
        if (min_start > time_use[i])
            min_start = time_use[i];
        if (max_end < time_use[i + BLOCK_NUM])
            max_end = time_use[i + BLOCK_NUM];
    }
    printf("GPUsum: %d gputime: %ld\n", final_sum, max_end - min_start);


    // CPU reference computation for verification.
    final_sum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        final_sum += data[i] * data[i] * data[i];
    }
    printf("CPUsum: %d \n", final_sum);
    return 0;
}

树状加法

参考:
http://hpc.pku.edu.cn/docs/20170829223804057249.pdf 深入浅出谈CUDA

#include "device_functions.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>

#define BLOCK_DIM 16


/**
 * Computes the squared Euclidean distance matrix between the query points and the reference points.
 *
 * Tiled algorithm: BLOCK_DIM x BLOCK_DIM sub-matrices of the reference and
 * query arrays are staged through shared memory, one tile per loop iteration.
 * Expects a 2D launch with blockDim = (BLOCK_DIM, BLOCK_DIM).
 *
 * @param ref          reference points stored in the global memory
 * @param ref_width    number of reference points
 * @param ref_pitch    pitch of the reference points array in number of column
 * @param query        query points stored in the global memory
 * @param query_width  number of query points
 * @param query_pitch  pitch of the query points array in number of columns
 * @param height       dimension of points = height of texture `ref` and of the array `query`
 * @param dist         array containing the query_width x ref_width computed distances
 */
__global__ void compute_distances(float * ref,
    int     ref_width,
    int     ref_pitch,
    float * query,
    int     query_width,
    int     query_pitch,
    int     height,
    float * dist) {

    // Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
    __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
    __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

    // Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step).
    // NOTE(review): these block-uniform scalars are written by every thread
    // below without a barrier before first use; all threads write identical
    // values so the result is consistent, but plain locals would be cleaner.
    __shared__ int begin_A;
    __shared__ int begin_B;
    __shared__ int step_A;
    __shared__ int step_B;
    __shared__ int end_A;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Initialization of the SSD (sum of squared differences) for the current thread
    float ssd = 0.f;

    // Loop parameters
    begin_A = BLOCK_DIM * blockIdx.y;
    begin_B = BLOCK_DIM * blockIdx.x;
    step_A = BLOCK_DIM * ref_pitch;
    step_B = BLOCK_DIM * query_pitch;
    end_A = begin_A + (height - 1) * ref_pitch;

    // Conditions
    int cond0 = (begin_A + tx < ref_width); // used to write in shared memory
    int cond1 = (begin_B + tx < query_width); // used to write in shared memory & to computations and to write in output array 
    int cond2 = (begin_A + ty < ref_width); // used to computations and to write in output matrix

    // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
    for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {

        // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
        if (a / ref_pitch + ty < height) {
            shared_A[ty][tx] = (cond0) ? ref[a + ref_pitch * ty + tx] : 0;
            shared_B[ty][tx] = (cond1) ? query[b + query_pitch * ty + tx] : 0;
        }
        else {
            // Zero-pad out-of-range rows so the reduction below stays valid.
            shared_A[ty][tx] = 0;
            shared_B[ty][tx] = 0;
        }

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
        if (cond2 && cond1) {
            for (int k = 0; k < BLOCK_DIM; ++k) {
                float tmp = shared_A[k][ty] - shared_B[k][tx];
                ssd += tmp * tmp;
            }
        }

        // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory; each thread writes one element
    if (cond2 && cond1) {
        dist[(begin_A + ty) * query_pitch + begin_B + tx] = ssd;
    }
}


/**
 * Computes the squared Euclidean distance matrix between the query points and the reference points.
 *
 * One thread per (query, reference) pair; the reference points are read
 * through a 2D texture object.
 * NOTE(review): `dist` is indexed with query_pitch, so this assumes the
 * distance matrix was allocated with the same pitch as the query array —
 * confirm at the call site.
 *
 * @param ref          reference points stored in the texture memory
 * @param ref_width    number of reference points
 * @param query        query points stored in the global memory
 * @param query_width  number of query points
 * @param query_pitch  pitch of the query points array in number of columns
 * @param height       dimension of points = height of texture `ref` and of the array `query`
 * @param dist         array containing the query_width x ref_width computed distances
 */
__global__ void compute_distance_texture(cudaTextureObject_t ref,
    int                 ref_width,
    float *             query,
    int                 query_width,
    int                 query_pitch,
    int                 height,
    float*              dist) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard: the grid rarely divides the matrix evenly.
    if (xIndex < query_width && yIndex < ref_width) {
        float ssd = 0.f;
        // Accumulate squared differences over all point dimensions.
        for (int i = 0; i < height; i++) {
            float tmp = tex2D<float>(ref, (float)yIndex, (float)i) - query[i * query_pitch + xIndex];
            ssd += tmp * tmp;
        }
        dist[yIndex * query_pitch + xIndex] = ssd;
    }
}


/**
 * For each reference point (i.e. each column) finds the k-th smallest distances
 * of the distance matrix and their respective indexes and gathers them at the top
 * of the 2 arrays.
 *
 * Since we only need to locate the k smallest distances, sorting the entire array
 * would not be very efficient if k is relatively small. Instead, we perform a
 * simple insertion sort by eventually inserting a given distance in the first
 * k values.
 *
 * @param dist         distance matrix
 * @param dist_pitch   pitch of the distance matrix given in number of columns
 * @param index        index matrix
 * @param index_pitch  pitch of the index matrix given in number of columns
 * @param width        width of the distance matrix and of the index matrix
 * @param height       height of the distance matrix
 * @param k            number of values to find
 */
__global__ void modified_insertion_sort(float * dist,
    int     dist_pitch,
    int *   index,
    int     index_pitch,
    int     width,
    int     height,
    int     k) {

    // Column position (one thread per column)
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

    // Do nothing if we are out of bounds
    if (xIndex < width) {

        // Pointer shift: both pointers now address this thread's column
        float * p_dist = dist + xIndex;
        int *   p_index = index + xIndex;

        // Initialise the first index
        p_index[0] = 0;

        // Go through all points
        for (int i = 1; i < height; ++i) {

            // Store current distance and associated index
            float curr_dist = p_dist[i*dist_pitch];
            int   curr_index = i;

            // Skip the current value if its index is >= k and it is higher than the k-th already-sorted smallest value
            if (i >= k && curr_dist >= p_dist[(k - 1)*dist_pitch]) {
                continue;
            }

            // Shift values (and indexes) higher than the current distance to the right
            int j = min(i, k - 1);
            while (j > 0 && p_dist[(j - 1)*dist_pitch] > curr_dist) {
                p_dist[j*dist_pitch] = p_dist[(j - 1)*dist_pitch];
                p_index[j*index_pitch] = p_index[(j - 1)*index_pitch];
                --j;
            }

            // Write the current distance and index at their position
            p_dist[j*dist_pitch] = curr_dist;
            p_index[j*index_pitch] = curr_index;
        }
    }
}


/**
 * Computes the square root of the first k lines of the distance matrix.
 *
 * @param dist   distance matrix
 * @param width  width of the distance matrix
 * @param pitch  pitch of the distance matrix given in number of columns
 * @param k      number of values to consider
 */
__global__ void compute_sqrt(float * dist, int width, int pitch, int k) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard for the grid tail.
    // sqrtf guarantees the single-precision overload is used regardless of
    // which math headers the host compiler provides.
    if (xIndex < width && yIndex < k)
        dist[yIndex*pitch + xIndex] = sqrtf(dist[yIndex*pitch + xIndex]);
}


/**
 * Computes the squared norm of each column of the input array.
 *
 * One thread per column; each thread accumulates the sum of squares of its
 * column's `height` elements.
 *
 * @param array   input array
 * @param width   number of columns of `array` = number of points
 * @param pitch   pitch of `array` in number of columns
 * @param height  number of rows of `array` = dimension of the points
 * @param norm    output array containing the squared norm values
 */
__global__ void compute_squared_norm(float * array, int width, int pitch, int height, float * norm) {
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col < width) {
        // Walk down the column, accumulating squares.
        float acc = 0.f;
        for (int row = 0; row < height; row++) {
            const float v = array[row * pitch + col];
            acc += v * v;
        }
        norm[col] = acc;
    }
}


/**
 * Add the reference points norm (column vector) to each column of the input array.
 *
 * @param array   input array
 * @param width   number of columns of `array` = number of points
 * @param pitch   pitch of `array` in number of columns
 * @param height  number of rows of `array` = dimension of the points
 * @param norm    reference points norm stored as a column vector
 */
__global__ void add_reference_points_norm(float * array, int width, int pitch, int height, float * norm) {
    unsigned int tx = threadIdx.x;
    unsigned int ty = threadIdx.y;
    unsigned int xIndex = blockIdx.x * blockDim.x + tx;
    unsigned int yIndex = blockIdx.y * blockDim.y + ty;
    // One norm value per row of this block's tile, staged through shared memory.
    // NOTE(review): the array is hard-coded to 16 entries, so this assumes
    // blockDim.y <= 16 (i.e. a BLOCK_DIM x BLOCK_DIM launch) — confirm at the
    // call site.
    __shared__ float shared_vec[16];
    // Only the first column of threads loads the vector; the barrier makes it
    // visible to the whole block before it is read.
    if (tx == 0 && yIndex < height)
        shared_vec[ty] = norm[yIndex];
    __syncthreads();
    if (xIndex < width && yIndex < height)
        array[yIndex*pitch + xIndex] += shared_vec[ty];
}


/**
 * Adds the query points norm (row vector) to the k first lines of the input
 * array and computes the square root of the resulting values.
 *
 * @param array   input array
 * @param width   number of columns of `array` = number of points
 * @param pitch   pitch of `array` in number of columns
 * @param k       number of neighbors to consider
 * @param norm    query points norm stored as a row vector
 */
__global__ void add_query_points_norm_and_sqrt(float * array, int width, int pitch, int k, float * norm) {
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    // Bounds guard for the grid tail.
    // sqrtf guarantees the single-precision overload is used regardless of
    // which math headers the host compiler provides.
    if (xIndex < width && yIndex < k)
        array[yIndex*pitch + xIndex] = sqrtf(array[yIndex*pitch + xIndex] + norm[xIndex]);
}


/**
 * Releases the device buffers used by `knn_cuda_global`.
 * Passing NULL for any pointer is safe: cudaFree ignores NULL.
 */
static void knn_global_cleanup(float * ref_dev, float * query_dev, float * dist_dev, int * index_dev) {
    cudaFree(ref_dev);
    cudaFree(query_dev);
    cudaFree(dist_dev);
    cudaFree(index_dev);
}


/**
 * K nearest neighbor search using CUDA global memory only.
 *
 * For each query point, computes the Euclidean distance to every reference
 * point, keeps the k smallest distances (with their reference indexes), and
 * copies the results back to the host.
 *
 * @param ref        reference points, row-major, one dimension per row
 * @param ref_nb     number of reference points
 * @param query      query points, row-major, one dimension per row
 * @param query_nb   number of query points
 * @param dim        dimension of the points
 * @param k          number of neighbors to return
 * @param knn_dist   output: k smallest distances per query (k rows of query_nb)
 * @param knn_index  output: corresponding reference indexes
 * @return true on success, false on any CUDA error (a message is printed)
 */
bool knn_cuda_global(const float * ref,
    int           ref_nb,
    const float * query,
    int           query_nb,
    int           dim,
    int           k,
    float *       knn_dist,
    int *         knn_index) {

    // Constants
    const unsigned int size_of_float = sizeof(float);
    const unsigned int size_of_int = sizeof(int);

    // Return variables
    cudaError_t err0, err1, err2, err3;

    // Check that we have at least one CUDA device
    int nb_devices;
    err0 = cudaGetDeviceCount(&nb_devices);
    if (err0 != cudaSuccess || nb_devices == 0) {
        printf("ERROR: No CUDA device found\n");
        return false;
    }

    // Select the first CUDA device as default
    err0 = cudaSetDevice(0);
    if (err0 != cudaSuccess) {
        printf("ERROR: Cannot set the chosen CUDA device\n");
        return false;
    }

    // Allocate pitched global memory so each row of the 2D layouts is aligned
    float * ref_dev = NULL;
    float * query_dev = NULL;
    float * dist_dev = NULL;
    int   * index_dev = NULL;
    size_t  ref_pitch_in_bytes;
    size_t  query_pitch_in_bytes;
    size_t  dist_pitch_in_bytes;
    size_t  index_pitch_in_bytes;
    err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb   * size_of_float, dim);
    err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
    err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
    err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
    if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess) {
        printf("ERROR: Memory allocation error\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }

    // Deduce pitch values expressed in number of elements
    size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
    size_t query_pitch = query_pitch_in_bytes / size_of_float;
    size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
    size_t index_pitch = index_pitch_in_bytes / size_of_int;

    // The sort kernel walks dist and index with a single pitch, so they must match
    if (query_pitch != dist_pitch || query_pitch != index_pitch) {
        printf("ERROR: Invalid pitch value\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }

    // Copy reference and query data from the host to the device
    err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from host to device\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }

    // Compute the squared Euclidean distances (grid rounded up to cover all points)
    dim3 block0(BLOCK_DIM, BLOCK_DIM, 1);
    dim3 grid0(query_nb / BLOCK_DIM, ref_nb / BLOCK_DIM, 1);
    if (query_nb % BLOCK_DIM != 0) grid0.x += 1;
    if (ref_nb   % BLOCK_DIM != 0) grid0.y += 1;
    compute_distances << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, query_dev, query_nb, query_pitch, dim, dist_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }

    // Sort the distances with their respective indexes (one thread per query)
    dim3 block1(256, 1, 1);
    dim3 grid1(query_nb / 256, 1, 1);
    if (query_nb % 256 != 0) grid1.x += 1;
    modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }

    // Compute the square root of the k smallest distances
    dim3 block2(16, 16, 1);
    dim3 grid2(query_nb / 16, k / 16, 1);
    if (query_nb % 16 != 0) grid2.x += 1;
    if (k % 16 != 0)        grid2.y += 1;
    compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }

    // Copy k smallest distances / indexes from the device to the host
    err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
    err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from device to host\n");
        knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);
        return false;
    }

    // Memory clean-up
    knn_global_cleanup(ref_dev, query_dev, dist_dev, index_dev);

    return true;
}


/**
 * Releases the device resources used by `knn_cuda_texture`.
 * The CUDA array and texture object are only released when they were actually
 * created (NULL / 0 means "not created yet"); cudaFree ignores NULL pointers.
 */
static void knn_texture_cleanup(float * query_dev, float * dist_dev, int * index_dev,
    cudaArray * ref_array_dev, cudaTextureObject_t ref_tex_dev) {
    cudaFree(query_dev);
    cudaFree(dist_dev);
    cudaFree(index_dev);
    if (ref_array_dev)
        cudaFreeArray(ref_array_dev);
    if (ref_tex_dev)
        cudaDestroyTextureObject(ref_tex_dev);
}


/**
 * K nearest neighbor search with the reference points stored in a CUDA
 * texture object (read-only cached access in the distance kernel).
 *
 * @param ref        reference points, row-major, one dimension per row
 * @param ref_nb     number of reference points
 * @param query      query points, row-major, one dimension per row
 * @param query_nb   number of query points
 * @param dim        dimension of the points
 * @param k          number of neighbors to return
 * @param knn_dist   output: k smallest distances per query (k rows of query_nb)
 * @param knn_index  output: corresponding reference indexes
 * @return true on success, false on any CUDA error (a message is printed)
 */
bool knn_cuda_texture(const float * ref,
    int           ref_nb,
    const float * query,
    int           query_nb,
    int           dim,
    int           k,
    float *       knn_dist,
    int *         knn_index) {

    // Constants
    unsigned int size_of_float = sizeof(float);
    unsigned int size_of_int = sizeof(int);

    // Return variables
    cudaError_t err0, err1, err2;

    // Check that we have at least one CUDA device
    int nb_devices;
    err0 = cudaGetDeviceCount(&nb_devices);
    if (err0 != cudaSuccess || nb_devices == 0) {
        printf("ERROR: No CUDA device found\n");
        return false;
    }

    // Select the first CUDA device as default
    err0 = cudaSetDevice(0);
    if (err0 != cudaSuccess) {
        printf("ERROR: Cannot set the chosen CUDA device\n");
        return false;
    }

    // Allocate pitched global memory for query points, distances and indexes
    float * query_dev = NULL;
    float * dist_dev = NULL;
    int *   index_dev = NULL;
    size_t  query_pitch_in_bytes;
    size_t  dist_pitch_in_bytes;
    size_t  index_pitch_in_bytes;
    err0 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
    err1 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
    err2 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
    if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess) {
        printf("ERROR: Memory allocation error (cudaMallocPitch)\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, NULL, 0);
        return false;
    }

    // Deduce pitch values expressed in number of elements
    size_t query_pitch = query_pitch_in_bytes / size_of_float;
    size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
    size_t index_pitch = index_pitch_in_bytes / size_of_int;

    // The sort kernel walks dist and index with a single pitch, so they must match
    if (query_pitch != dist_pitch || query_pitch != index_pitch) {
        printf("ERROR: Invalid pitch value\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, NULL, 0);
        return false;
    }

    // Copy query data from the host to the device
    err0 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    if (err0 != cudaSuccess) {
        printf("ERROR: Unable to copy data from host to device\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, NULL, 0);
        return false;
    }

    // Allocate a CUDA array (ref_nb x dim, single float channel) for the reference points
    cudaArray* ref_array_dev = NULL;
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    err0 = cudaMallocArray(&ref_array_dev, &channel_desc, ref_nb, dim);
    if (err0 != cudaSuccess) {
        printf("ERROR: Memory allocation error (cudaMallocArray)\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, NULL, 0);
        return false;
    }

    // Copy reference points from host to device. cudaMemcpy2DToArray replaces
    // the cudaMemcpyToArray call deprecated since CUDA 10.1; the host buffer
    // is contiguous, so source pitch == row width in bytes.
    err0 = cudaMemcpy2DToArray(ref_array_dev, 0, 0, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    if (err0 != cudaSuccess) {
        printf("ERROR: Unable to copy data from host to device\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, 0);
        return false;
    }

    // Resource descriptor: the texture is backed by the CUDA array
    struct cudaResourceDesc res_desc;
    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = cudaResourceTypeArray;
    res_desc.res.array.array = ref_array_dev;

    // Texture descriptor: unnormalized integer coordinates, no filtering
    struct cudaTextureDesc tex_desc;
    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.addressMode[0] = cudaAddressModeClamp;
    tex_desc.addressMode[1] = cudaAddressModeClamp;
    tex_desc.filterMode = cudaFilterModePoint;
    tex_desc.readMode = cudaReadModeElementType;
    tex_desc.normalizedCoords = 0;

    // Create the texture
    cudaTextureObject_t ref_tex_dev = 0;
    err0 = cudaCreateTextureObject(&ref_tex_dev, &res_desc, &tex_desc, NULL);
    if (err0 != cudaSuccess) {
        printf("ERROR: Unable to create the texture\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, 0);
        return false;
    }

    // Compute the squared Euclidean distances (grid rounded up to cover all points)
    dim3 block0(16, 16, 1);
    dim3 grid0(query_nb / 16, ref_nb / 16, 1);
    if (query_nb % 16 != 0) grid0.x += 1;
    if (ref_nb % 16 != 0) grid0.y += 1;
    compute_distance_texture << <grid0, block0 >> > (ref_tex_dev, ref_nb, query_dev, query_nb, query_pitch, dim, dist_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Sort the distances with their respective indexes (one thread per query)
    dim3 block1(256, 1, 1);
    dim3 grid1(query_nb / 256, 1, 1);
    if (query_nb % 256 != 0) grid1.x += 1;
    modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Compute the square root of the k smallest distances
    dim3 block2(16, 16, 1);
    dim3 grid2(query_nb / 16, k / 16, 1);
    if (query_nb % 16 != 0) grid2.x += 1;
    if (k % 16 != 0)        grid2.y += 1;
    compute_sqrt << <grid2, block2 >> > (dist_dev, query_nb, query_pitch, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Copy k smallest distances / indexes from the device to the host
    err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
    err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from device to host\n");
        knn_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);
        return false;
    }

    // Memory clean-up
    knn_texture_cleanup(query_dev, dist_dev, index_dev, ref_array_dev, ref_tex_dev);

    return true;
}


/**
 * Releases the device buffers used by `knn_cublas` and shuts CUBLAS down.
 * Passing NULL for any pointer is safe: cudaFree ignores NULL.
 */
static void knn_cublas_cleanup(float * ref_dev, float * query_dev, float * dist_dev,
    int * index_dev, float * ref_norm_dev, float * query_norm_dev) {
    cudaFree(ref_dev);
    cudaFree(query_dev);
    cudaFree(dist_dev);
    cudaFree(index_dev);
    cudaFree(ref_norm_dev);
    cudaFree(query_norm_dev);
    cublasShutdown();
}


/**
 * K nearest neighbor search using CUBLAS: the squared distance matrix is built
 * as |ref|^2 - 2*query.ref^T + |query|^2, with the cross term computed by a
 * single SGEMM.
 *
 * NOTE(review): this uses the legacy CUBLAS API (cublasInit / cublasShutdown /
 * char-op cublasSgemm / cublasGetError), which is removed from recent CUDA
 * toolkits — migrating to cublas_v2 (handle-based) is advisable when the
 * project's include set can be updated.
 *
 * @param ref        reference points, row-major, one dimension per row
 * @param ref_nb     number of reference points
 * @param query      query points, row-major, one dimension per row
 * @param query_nb   number of query points
 * @param dim        dimension of the points
 * @param k          number of neighbors to return
 * @param knn_dist   output: k smallest distances per query (k rows of query_nb)
 * @param knn_index  output: corresponding reference indexes
 * @return true on success, false on any CUDA/CUBLAS error (a message is printed)
 */
bool knn_cublas(const float * ref,
    int           ref_nb,
    const float * query,
    int           query_nb,
    int           dim,
    int           k,
    float *       knn_dist,
    int *         knn_index) {

    // Constants
    const unsigned int size_of_float = sizeof(float);
    const unsigned int size_of_int = sizeof(int);

    // Return variables
    cudaError_t  err0, err1, err2, err3, err4, err5;

    // Check that we have at least one CUDA device
    int nb_devices;
    err0 = cudaGetDeviceCount(&nb_devices);
    if (err0 != cudaSuccess || nb_devices == 0) {
        printf("ERROR: No CUDA device found\n");
        return false;
    }

    // Select the first CUDA device as default
    err0 = cudaSetDevice(0);
    if (err0 != cudaSuccess) {
        printf("ERROR: Cannot set the chosen CUDA device\n");
        return false;
    }

    // Initialize CUBLAS (legacy API)
    cublasInit();

    // Allocate pitched buffers for the 2D data plus two norm vectors
    float * ref_dev = NULL;
    float * query_dev = NULL;
    float * dist_dev = NULL;
    int   * index_dev = NULL;
    float * ref_norm_dev = NULL;
    float * query_norm_dev = NULL;
    size_t  ref_pitch_in_bytes;
    size_t  query_pitch_in_bytes;
    size_t  dist_pitch_in_bytes;
    size_t  index_pitch_in_bytes;
    err0 = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_nb   * size_of_float, dim);
    err1 = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, query_nb * size_of_float, dim);
    err2 = cudaMallocPitch((void**)&dist_dev, &dist_pitch_in_bytes, query_nb * size_of_float, ref_nb);
    err3 = cudaMallocPitch((void**)&index_dev, &index_pitch_in_bytes, query_nb * size_of_int, k);
    err4 = cudaMalloc((void**)&ref_norm_dev, ref_nb   * size_of_float);
    err5 = cudaMalloc((void**)&query_norm_dev, query_nb * size_of_float);
    if (err0 != cudaSuccess || err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess || err4 != cudaSuccess || err5 != cudaSuccess) {
        printf("ERROR: Memory allocation error\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Deduce pitch values expressed in number of elements
    size_t ref_pitch = ref_pitch_in_bytes / size_of_float;
    size_t query_pitch = query_pitch_in_bytes / size_of_float;
    size_t dist_pitch = dist_pitch_in_bytes / size_of_float;
    size_t index_pitch = index_pitch_in_bytes / size_of_int;

    // The sort kernel walks dist and index with a single pitch, so they must match
    if (query_pitch != dist_pitch || query_pitch != index_pitch) {
        printf("ERROR: Invalid pitch value\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Copy reference and query data from the host to the device
    err0 = cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref, ref_nb * size_of_float, ref_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    err1 = cudaMemcpy2D(query_dev, query_pitch_in_bytes, query, query_nb * size_of_float, query_nb * size_of_float, dim, cudaMemcpyHostToDevice);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from host to device\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Compute the squared norm of the reference points
    dim3 block0(256, 1, 1);
    dim3 grid0(ref_nb / 256, 1, 1);
    if (ref_nb % 256 != 0) grid0.x += 1;
    compute_squared_norm << <grid0, block0 >> > (ref_dev, ref_nb, ref_pitch, dim, ref_norm_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Compute the squared norm of the query points
    dim3 block1(256, 1, 1);
    dim3 grid1(query_nb / 256, 1, 1);
    if (query_nb % 256 != 0) grid1.x += 1;
    compute_squared_norm << <grid1, block1 >> > (query_dev, query_nb, query_pitch, dim, query_norm_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Computation of -2 * query * transpose(reference) into dist_dev
    cublasSgemm('n', 't', (int)query_pitch, (int)ref_pitch, dim, (float)-2.0, query_dev, query_pitch, ref_dev, ref_pitch, (float)0.0, dist_dev, query_pitch);
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS) {
        printf("ERROR: Unable to execute cublasSgemm\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Add reference points norm to every column of the distance matrix
    dim3 block2(16, 16, 1);
    dim3 grid2(query_nb / 16, ref_nb / 16, 1);
    if (query_nb % 16 != 0) grid2.x += 1;
    if (ref_nb % 16 != 0) grid2.y += 1;
    add_reference_points_norm << <grid2, block2 >> > (dist_dev, query_nb, dist_pitch, ref_nb, ref_norm_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Sort each column (reuses grid1/block1: one thread per query point)
    modified_insertion_sort << <grid1, block1 >> > (dist_dev, dist_pitch, index_dev, index_pitch, query_nb, ref_nb, k);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Add query norm and compute the square root of the k first elements
    dim3 block3(16, 16, 1);
    dim3 grid3(query_nb / 16, k / 16, 1);
    if (query_nb % 16 != 0) grid3.x += 1;
    if (k % 16 != 0) grid3.y += 1;
    add_query_points_norm_and_sqrt << <grid3, block3 >> > (dist_dev, query_nb, dist_pitch, k, query_norm_dev);
    if (cudaGetLastError() != cudaSuccess) {
        printf("ERROR: Unable to execute kernel\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Copy k smallest distances / indexes from the device to the host
    err0 = cudaMemcpy2D(knn_dist, query_nb * size_of_float, dist_dev, dist_pitch_in_bytes, query_nb * size_of_float, k, cudaMemcpyDeviceToHost);
    err1 = cudaMemcpy2D(knn_index, query_nb * size_of_int, index_dev, index_pitch_in_bytes, query_nb * size_of_int, k, cudaMemcpyDeviceToHost);
    if (err0 != cudaSuccess || err1 != cudaSuccess) {
        printf("ERROR: Unable to copy data from device to host\n");
        knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);
        return false;
    }

    // Memory clean-up and CUBLAS shutdown
    knn_cublas_cleanup(ref_dev, query_dev, dist_dev, index_dev, ref_norm_dev, query_norm_dev);

    return true;
}
// (End of file — scraped web-page navigation text removed here: it was not part of the source code.)