OpenCL_LAB2

2017-04-15  本文已影响0人  Bing2464

运行 hello.cpp 和 vadd.cpp

矩阵乘法

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

// Reads the entire contents of a file into a string.
//
// The file is opened in binary mode, so bytes are stored exactly as they are
// on disk (no newline translation), including any embedded NUL bytes.
//
// filename: path of the file to read.
// s:        receives the file contents on success; left untouched on failure.
//
// Returns 0 on success, 1 if the file could not be opened or fully read
// (a diagnostic is printed to stdout in that case).
int convertToString(const char *filename, std::string &s) {
    std::ifstream f(filename, std::ios::in | std::ios::binary);
    if (!f.is_open()) {
        printf("Error: Failed to open file %s\n", filename);
        return 1;
    }

    // Determine the size by seeking to the end. tellg() reports -1 on
    // failure, which must not be turned into an allocation size.
    f.seekg(0, std::ios::end);
    const std::streamoff fileSize = f.tellg();
    f.seekg(0, std::ios::beg);
    if (!f || fileSize < 0) {
        printf("Error: Failed to read file %s\n", filename);
        return 1;
    }

    // Read straight into a string so no manually managed buffer is needed.
    std::string contents(static_cast<std::string::size_type>(fileSize), '\0');
    if (fileSize > 0) {
        f.read(&contents[0], static_cast<std::streamsize>(fileSize));
        if (f.gcount() != static_cast<std::streamsize>(fileSize)) {
            printf("Error: Failed to read file %s\n", filename);
            return 1;
        }
    }

    // Only publish the result once the whole file has been read.
    s.swap(contents);
    return 0;
}


int main(int argc, char *argv[]) {
    double cputime, gputime;
    clock_t timestamp;

    const int W = 100;
    const int mat_size = W * W;

    // 在 host 内存中创建三个缓冲区
    float *const buf1 = (float *) malloc(mat_size * sizeof(float));
    float *const buf2 = (float *) malloc(mat_size * sizeof(float));
    float *const buf = (float *) malloc(mat_size * sizeof(float));
    float *const op_data = (float *) malloc(mat_size * sizeof(float));

    // 初始化矩阵
    srand((unsigned int) time(NULL));
    for (int i = 0; i < mat_size; i++)
        buf1[i] = float(rand() % 1000) * M_PI;

    srand((unsigned int) time(NULL) + 1000);
    for (int i = 0; i < mat_size; i++)
        buf2[i] = float(rand() % 1000) * M_PI;

    // 时间戳
    timestamp = clock();

    for (int i = 0; i < mat_size; i++) {
        float tmp = 0.0;
        for (int k = 0; k < W; k++)
            tmp += buf1[i * W + k] * buf2[k * W + i];
        buf[i * W + i] = tmp;
    }

    cputime = (double) (clock() - timestamp) / CLOCKS_PER_SEC * 1000;
    printf("串行执行时间:%8.3f ms\n", cputime);

    cl_platform_id platform;
    cl_event prof_event;

    // 创建平台对象
    clGetPlatformIDs(1, &platform, NULL);
    cl_device_id device;

    // 创建 GPU 设备
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
    // 创建 context
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    // 创建命令队列
    cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

    // 创建三个 OpenCL 内存对象
    cl_mem objects[3];
    objects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * mat_size, buf1,
                                   NULL);
    objects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * mat_size, buf2,
                                   NULL);
    objects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * mat_size, buf,
                                   NULL);

    const char *filename = "mul.cl";
    std::string sourceStr;
    convertToString(filename, sourceStr);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = {strlen(source)};

    cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
    // 编译程序对象
    clBuildProgram(program, 1, &device, NULL, NULL, NULL);

    // 创建 Kernel 对象
    cl_kernel kernel = clCreateKernel(program, "matrix_mult", NULL);

    // 设置 Kernel 参数
    clSetKernelArg(kernel, 0, sizeof(int), &W);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &objects[0]);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &objects[1]);
    clSetKernelArg(kernel, 3, sizeof(cl_mem), &objects[2]);


    //执行 kernel
    cl_ulong ev_start_time = (cl_ulong) 0;
    cl_ulong ev_end_time = (cl_ulong) 0;
    size_t global[1];
    global[0] = (size_t) W;
    clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL, 0, NULL, &prof_event);

    clFinish(queue);

    //读取时间
    clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ev_start_time, NULL);
    clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ev_end_time, NULL);
    gputime = (double) (ev_end_time - ev_start_time) * 1e-6;
    printf("OpenCL 执行时间:%8.3f ms\n", gputime);

    //数据拷回 host 内存
    clEnqueueReadBuffer(queue, objects[2], CL_TRUE, 0, sizeof(float) * mat_size, op_data, 0, NULL, NULL);

    // 验证 GPU 计算结果
    for (int i = 0; i < mat_size; i++) {
        if (fabs(buf[i] - op_data[i]) > 0.0001) {
            printf("check failed\n");
            break;
        }
    }

    if (buf1) 
        free(buf1);
    if (buf2) 
        free(buf2);
    if (buf) 
        free(buf);
    if (op_data) 
        free(op_data);

    // 删除 OpenCL 资源对象
    clReleaseMemObject(objects[2]);
    clReleaseMemObject(objects[1]);
    clReleaseMemObject(objects[0]);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}

上一篇 | 下一篇

猜你喜欢

热点阅读