OpenMP 多线程加速矩阵乘法

2020-10-07  本文已影响0人  侠之大者_7d3f

MatMul测试结果

image.png image.png

测试代码

matmul.hpp


#pragma once
#include<assert.h>
#include<chrono>
#include<cstddef>
#include<fstream>
#include<random>
#include<string>
#include<vector>
#include<omp.h>

// Stopwatch utility for measuring elapsed wall-clock intervals.
//
// The stopwatch starts when the object is constructed and can be restarted
// with reset(). The elapsed_*() accessors report the time since the last
// start in the requested unit and do not stop or modify the stopwatch.
class TimeElapsed{

private:
    // steady_clock is monotonic, so an interval can never come out negative
    // or jump when the system time is adjusted; high_resolution_clock gives
    // no such guarantee.
    using Clock = std::chrono::steady_clock;

    Clock::time_point start;

    // Time since `start`, expressed as a floating-point count of `Period`.
    template <class Period>
    double elapsed_as() const {
        return std::chrono::duration<double, Period>(Clock::now() - start).count();
    }

public:
    // Starts the stopwatch immediately.
    TimeElapsed() : start(Clock::now()) {}

    // Restarts the stopwatch from the current instant.
    void reset(){
        start = Clock::now();
    }

    // Milliseconds elapsed since construction or the last reset().
    double elapsed_ms() const {
        return elapsed_as<std::milli>();
    }

    // Microseconds elapsed since construction or the last reset().
    double elapsed_us() const {
        return elapsed_as<std::micro>();
    }

    // Nanoseconds elapsed since construction or the last reset().
    double elapsed_ns() const {
        return elapsed_as<std::nano>();
    }
};

// Dense matrix stored contiguously in row-major order.
//
// Storage is owned by a std::vector, so copying a Matrix produces an
// independent deep copy and destruction releases the memory automatically
// (no hand-written destructor / copy / move members are needed).
// Elements are value-initialized (zero for arithmetic types).
//
// NOTE: for dtype == bool, std::vector<bool> packs bits, so concurrent writes
// to different elements are not safe; prefer char/uint8_t in parallel code.
template <class dtype>
class Matrix{
private:
    std::vector<dtype> data;   // rows*cols elements, row-major
    int numel_ = 0;            // cached element count
public:
    int rows = 0;
    int cols = 0;

private:
    // Validates the dimensions and returns the element count computed in
    // std::size_t so that rows*cols cannot overflow int arithmetic.
    static std::size_t checkedCount(int rows, int cols){
        assert(rows>0&&cols>0);
        return static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols);
    }

    // Linear offset of element (row_idx, col_idx) in row-major storage.
    std::size_t offset(int row_idx, int col_idx) const {
        return static_cast<std::size_t>(row_idx) * static_cast<std::size_t>(this->cols)
             + static_cast<std::size_t>(col_idx);
    }

public:
    // Creates a rows x cols matrix. Both dimensions must be positive
    // (checked with assert).
    Matrix(int rows, int cols)
        : data(checkedCount(rows, cols)),
          numel_(static_cast<int>(data.size())),
          rows(rows),
          cols(cols) {}

    // Creates an empty 0 x 0 matrix.
    Matrix() = default;

    // Total number of elements (rows * cols).
    int numel() const {
        return this->numel_;
    }

    // Returns a copy of the element at (row_idx, col_idx).
    // Indices are not range-checked; the caller must keep them in bounds.
    dtype operator()(int row_idx, int col_idx) const{
        return this->data[offset(row_idx, col_idx)];
    }

    // Stores `value` at (row_idx, col_idx).
    // Indices are not range-checked; the caller must keep them in bounds.
    void setValue(int row_idx, int col_idx, dtype value){
        this->data[offset(row_idx, col_idx)] = value;
    }

    // Placeholder: printing is not implemented.
    void show(){

    }

    // Writes a small text header followed by every element, one per line in
    // row-major order, to `filename`. If the file cannot be opened nothing is
    // written and no error is reported.
    void save(const std::string& filename) const {
        std::ofstream out(filename,std::ios::out);
        // '\n' instead of std::endl: same bytes, but no flush per element.
        out<<"========Matrix info=========="<<'\n'
           <<"rows:"<<this->rows<<'\n'
           <<"cols:"<<this->cols<<'\n'
           <<"============================="<<'\n';
        for(const auto& element : this->data){
            out<<element<<'\n';
        }
        // The stream is flushed and closed by its destructor.
    }

};

// Builds a rows x cols matrix filled with pseudo-random values.
//
// Values are drawn, in row-major order, from a default-constructed
// std::uniform_real_distribution<T> driven by a default-constructed
// std::default_random_engine. Because the engine is default-constructed on
// every call, each call generates the same sequence of values.
template<typename T>
Matrix<T> randMatrix(int rows, int cols){
    Matrix<T> result(rows, cols);
    std::default_random_engine generator;
    std::uniform_real_distribution<T> uniform;
    for(int r = 0; r < rows; ++r){
        for(int c = 0; c < cols; ++c){
            const T sample = uniform(generator);
            result.setValue(r, c, sample);
        }
    }
    return result;
}

// Matrix product C = A * B, straightforward single-threaded baseline.
//
// A is (A.rows x A.cols) and B is (B.rows x B.cols); A.cols must equal
// B.rows (checked with assert). Returns a new A.rows x B.cols matrix where
// each entry is the dot product of a row of A with a column of B.
template<typename T>
Matrix<T> matmul_native(Matrix<T>& A, Matrix<T>& B){
    assert(A.cols == B.rows);
    const int outRows = A.rows;
    const int inner = A.cols;
    const int outCols = B.cols;

    Matrix<T> product(outRows, outCols);

    for(int r = 0; r < outRows; ++r){
        for(int c = 0; c < outCols; ++c){
            // Accumulate the dot product of row r of A and column c of B.
            T dot = static_cast<T>(0);
            for(int idx = 0; idx < inner; ++idx){
                dot += A(r, idx) * B(idx, c);
            }
            product.setValue(r, c, dot);
        }
    }
    return product;
}

// Matrix product C = A * B with the outer (row) loop parallelized by OpenMP.
//
// A.cols must equal B.rows (checked with assert). `thread_num` is passed to
// the num_threads clause of the parallel-for. Each iteration of the outer
// loop writes only entries of its own output row. Returns a new
// A.rows x B.cols matrix.
template<typename T>
Matrix<T> matmul_openmp(Matrix<T>& A, Matrix<T>& B, int thread_num){
    assert(A.cols == B.rows);
    const int outRows = A.rows;
    const int inner = A.cols;
    const int outCols = B.cols;

    Matrix<T> product(outRows, outCols);

    #pragma omp parallel for num_threads(thread_num)
    for(int r = 0; r < outRows; ++r){
        for(int c = 0; c < outCols; ++c){
            // Accumulate the dot product of row r of A and column c of B.
            T dot = static_cast<T>(0);
            for(int idx = 0; idx < inner; ++idx){
                dot += A(r, idx) * B(idx, c);
            }
            product.setValue(r, c, dot);
        }
    }
    return product;
}

main.cpp

#include<pthread.h>
#include<iostream>
#include<numeric>
#include<algorithm>
#include<vector>
#include"matmul.hpp"


// Argument bundle handed to the benchmark thread entry points through the
// void* parameter of pthread_create. The pointers are non-owning; the
// matrices they refer to must outlive the threads that use them.
struct InputMatrixs{
    Matrix<float>* a = nullptr;   // left operand of the product
    Matrix<float>* b = nullptr;   // right operand of the product
};

// Number of timed repetitions each benchmark averages over.
int LOOPS{50};
// Thread count the OpenMP benchmark passes to matmul_openmp.
int OMP_THREADS_NUM{4};


void* test_native_matmul(void* args){
    InputMatrixs* inputs = (struct InputMatrixs*)args;
    Matrix<float> a = *(inputs->a);
    Matrix<float> b = *(inputs->b);

    // 统计时间,运行matmul, 计算时间
    TimeElapsed timer;
    static double time_cost = 0;
    for(int i=0;i<LOOPS;i++){
        timer.reset();
        Matrix<float> c = matmul_native<float>(a, b);
        time_cost+=timer.elapsed_ms();
    }
    time_cost /= LOOPS;
    return &time_cost;
}

void* test_omp_matmul(void* args){
    InputMatrixs* inputs = (struct InputMatrixs*)args;
    Matrix<float> a = *(inputs->a);
    Matrix<float> b = *(inputs->b);

    // 统计时间,运行matmul, 计算时间
    TimeElapsed timer;
    static double time_cost = 0;
    for(int i=0;i<LOOPS;i++){
        timer.reset();
        Matrix<float> c = matmul_openmp<float>(a, b, OMP_THREADS_NUM);
        time_cost+=timer.elapsed_ms();
    }
    time_cost /= LOOPS;
    return &time_cost;

}


// Benchmark driver: multiplies two random 2000x2000 float matrices with the
// baseline and the OpenMP implementation, each on its own pthread, and prints
// the average time per multiplication. Returns 0 on success and 1 if a thread
// could not be created/joined or produced no result.
int main(){

    /**
     * Benchmark of matrix multiplication (MatMul)
     *
     * C = MatMul(A,B)
     *
     * Matrix A: M*N
     * Matrix B: N*K
     * Matrix C: M*K
     */

    // Create the input matrices.
    int M = 2000;
    int N = 2000;
    int K = 2000;

    Matrix<float> A = randMatrix<float>(M, N);
    Matrix<float> B = randMatrix<float>(N, K);

    InputMatrixs inputs;
    inputs.a = &A;
    inputs.b = &B;

    // Create two threads with pthread:
    //   tid_native runs matmul_native, the unoptimized MatMul;
    //   tid_omp    runs matmul_openmp, the OpenMP-accelerated MatMul.
    // NOTE(review): both benchmarks run concurrently, so they compete for CPU
    // cores and the reported times influence each other.
    pthread_t tid_native, tid_omp;
    int rc = pthread_create(&tid_native, nullptr, test_native_matmul, &inputs);
    if(rc != 0){
        std::cerr << "pthread_create(native) failed, error " << rc << '\n';
        return 1;
    }
    rc = pthread_create(&tid_omp, nullptr, test_omp_matmul, &inputs);
    if(rc != 0){
        std::cerr << "pthread_create(omp) failed, error " << rc << '\n';
        // `inputs`, A and B live on this stack frame, so the running thread
        // must finish before main returns.
        pthread_join(tid_native, nullptr);
        return 1;
    }

    // Collect the average run time reported by each benchmark thread.
    void* time_native = nullptr;
    void* time_omp = nullptr;

    const int rc_native = pthread_join(tid_native, &time_native);
    const int rc_omp = pthread_join(tid_omp, &time_omp);
    if(rc_native != 0 || rc_omp != 0){
        std::cerr << "pthread_join failed, errors " << rc_native << ", " << rc_omp << '\n';
        return 1;
    }
    if(time_native == nullptr || time_omp == nullptr){
        std::cerr << "benchmark thread returned no result" << '\n';
        return 1;
    }

    // std::fixed with the default precision of 6 prints the same text as
    // printf("%f"), without relying on <cstdio> being included indirectly.
    std::cout << std::fixed
              << "MatMul(native):" << *static_cast<double*>(time_native) << "ms\n"
              << "MatMul(omp):" << *static_cast<double*>(time_omp) << "ms\n";

    return 0;
}
上一篇 下一篇

猜你喜欢

热点阅读