OpenMP 多线程加速矩阵乘法
2020-10-07 本文已影响0人
侠之大者_7d3f
MatMul测试结果
image.png image.png
测试代码
matmul.hpp
#pragma once
#include<assert.h>
#include<chrono>
#include<fstream>
#include<random>
#include<string>
#include<vector>
#include<omp.h>
// Stopwatch utility for measuring how long a piece of code takes.
//
// The timer starts when the object is constructed and can be restarted with
// reset(). The elapsed_*() accessors report the time since the most recent
// start; they do not stop or modify the timer, so they may be called
// repeatedly and on const instances.
class TimeElapsed{
private:
    // steady_clock is monotonic, so measured intervals cannot go backwards when
    // the wall clock is adjusted; high_resolution_clock does not promise that.
    using Clock = std::chrono::steady_clock;

    // Moment of construction or of the latest reset().
    Clock::time_point start;

    // Time since `start`, expressed as a floating-point count of `Period` units.
    template <class Period>
    double elapsed() const{
        return std::chrono::duration<double, Period>(Clock::now() - start).count();
    }
public:
    // Starts timing immediately.
    TimeElapsed() : start(Clock::now()) {}

    // Restarts the timer from the current instant.
    void reset(){
        start = Clock::now();
    }

    // Returns the elapsed time in milliseconds.
    double elapsed_ms() const{
        return elapsed<std::milli>();
    }

    // Returns the elapsed time in microseconds.
    double elapsed_us() const{
        return elapsed<std::micro>();
    }

    // Returns the elapsed time in nanoseconds.
    double elapsed_ns() const{
        return elapsed<std::nano>();
    }
};
// Dense matrix of `dtype` elements stored contiguously in row-major order.
//
// Storage is owned by a std::vector, so copying a Matrix produces an
// independent deep copy, moving is cheap, and the buffer is released
// automatically when the Matrix goes out of scope.
template <class dtype>
class Matrix{
private:
    // Element storage; element (r, c) lives at index r * cols + c.
    std::vector<dtype> data;
    // Cached element count (rows * cols), reported by numel().
    int numel_ = 0;

    // Flat index of element (row_idx, col_idx). Computed in std::size_t so the
    // multiplication cannot overflow int for large matrices.
    std::size_t offset(int row_idx, int col_idx) const{
        return static_cast<std::size_t>(row_idx) * static_cast<std::size_t>(this->cols)
             + static_cast<std::size_t>(col_idx);
    }
public:
    int rows = 0;
    int cols = 0;
public:
    // Creates a rows x cols matrix whose elements are value-initialized
    // (zero for arithmetic types). Both dimensions must be positive; this is
    // checked with assert, i.e. only in builds without NDEBUG.
    // numel() reports the element count as int, so rows * cols is expected to
    // fit in an int.
    Matrix(int rows, int cols){
        assert(rows>0&&cols>0);
        this->rows = rows;
        this->cols = cols;
        this->numel_ = rows * cols;
        this->data.resize(static_cast<std::size_t>(rows) * static_cast<std::size_t>(cols));
    }

    // Creates an empty 0 x 0 matrix that owns no elements.
    Matrix() = default;

    // Returns the total number of elements (rows * cols).
    int numel() const{
        return this->numel_;
    }

    // Returns a copy of the element at (row_idx, col_idx). No bounds checking.
    dtype operator()(int row_idx, int col_idx) const{
        return this->data[offset(row_idx, col_idx)];
    }

    // Overwrites the element at (row_idx, col_idx) with `value`. No bounds checking.
    void setValue(int row_idx, int col_idx, dtype value){
        this->data[offset(row_idx, col_idx)] = value;
    }

    // Placeholder: currently does nothing.
    void show() const{
    }

    // Writes the matrix to the text file `filename`: a short header with the
    // dimensions followed by one element per line in row-major order.
    void save(const std::string& filename) const{
        std::ofstream out(filename,std::ios::out);
        out<<"========Matrix info=========="<<'\n'
           <<"rows:"<<this->rows<<'\n'
           <<"cols:"<<this->cols<<'\n'
           <<"============================="<<'\n';
        // '\n' instead of std::endl: same file contents without a flush per element.
        for(std::size_t i=0;i<this->data.size();i++){
            out<<this->data[i]<<'\n';
        }
        // The stream is flushed and closed by its destructor.
    }
};
// Builds a rows x cols matrix filled with pseudo-random values.
//
// Samples come from a default-constructed std::uniform_real_distribution<T>
// driven by a default-constructed std::default_random_engine, so every call
// starts from the same engine state. Elements are filled in row-major order.
template<typename T>
Matrix<T> randMatrix(int rows, int cols){
    Matrix<T> result(rows, cols);
    std::default_random_engine rng;
    std::uniform_real_distribution<T> dist;
    for(int r = 0; r < rows; ++r){
        for(int c = 0; c < cols; ++c){
            const T sample = dist(rng);
            result.setValue(r, c, sample);
        }
    }
    return result;
}
// Matrix multiplication, straightforward single-threaded version.
//
// Returns the product A * B as a new matrix with A.rows rows and B.cols
// columns. A.cols must equal B.rows (checked with assert).
template<typename T>
Matrix<T> matmul_native(Matrix<T>& A, Matrix<T>& B){
    assert(A.cols == B.rows);
    const int out_rows = A.rows;
    const int inner = A.cols;
    const int out_cols = B.cols;
    Matrix<T> product(out_rows, out_cols);
    for(int r = 0; r < out_rows; ++r){
        for(int c = 0; c < out_cols; ++c){
            // Dot product of row r of A with column c of B.
            T acc = static_cast<T>(0);
            for(int k = 0; k < inner; ++k){
                acc += A(r, k) * B(k, c);
            }
            product.setValue(r, c, acc);
        }
    }
    return product;
}
// Matrix multiplication, OpenMP-parallelized version.
//
// Returns the product A * B as a new matrix with A.rows rows and B.cols
// columns. A.cols must equal B.rows (checked with assert).
//
// thread_num is the number of threads requested for the parallel loop.
// Non-positive values are treated as 1, because the num_threads clause
// requires a positive value.
template<typename T>
Matrix<T> matmul_openmp(Matrix<T>& A, Matrix<T>& B, int thread_num){
    assert(A.cols == B.rows);
    const int M = A.rows;
    const int N = A.cols;
    const int K = B.cols;
    const int threads = thread_num > 0 ? thread_num : 1;
    Matrix<T> C = Matrix<T>(M,K);
    // The outer loop over rows of C is split across threads; each iteration
    // writes only elements of row i, so iterations do not write the same element.
    #pragma omp parallel for num_threads(threads)
    for(int i=0;i<M;i++){
        for(int j=0;j<K;j++){
            // Dot product of row i of A with column j of B.
            T tmp = static_cast<T>(0);
            for(int k=0;k<N;k++){
                tmp += A(i,k) * B(k,j);
            }
            C.setValue(i,j,tmp);
        }
    }
    return C;
}
main.cpp
#include<pthread.h>
#include<iostream>
#include<numeric>
#include<algorithm>
#include<vector>
#include"matmul.hpp"
// Argument bundle handed to the benchmark thread entry points through
// pthread's single void* parameter.
//
// The pointers are not owned by this struct (it never frees them); the
// pointed-to matrices must stay alive for as long as a thread uses them.
// Both default to nullptr so a default-constructed bundle never holds
// indeterminate pointers.
struct InputMatrixs{
    Matrix<float>* a = nullptr;  // left operand of the multiplication
    Matrix<float>* b = nullptr;  // right operand of the multiplication
};
// Number of timed repetitions each benchmark averages over.
int LOOPS{50};
// Thread count the OpenMP benchmark passes to matmul_openmp.
int OMP_THREADS_NUM{4};
void* test_native_matmul(void* args){
InputMatrixs* inputs = (struct InputMatrixs*)args;
Matrix<float> a = *(inputs->a);
Matrix<float> b = *(inputs->b);
// 统计时间,运行matmul, 计算时间
TimeElapsed timer;
static double time_cost = 0;
for(int i=0;i<LOOPS;i++){
timer.reset();
Matrix<float> c = matmul_native<float>(a, b);
time_cost+=timer.elapsed_ms();
}
time_cost /= LOOPS;
return &time_cost;
}
void* test_omp_matmul(void* args){
InputMatrixs* inputs = (struct InputMatrixs*)args;
Matrix<float> a = *(inputs->a);
Matrix<float> b = *(inputs->b);
// 统计时间,运行matmul, 计算时间
TimeElapsed timer;
static double time_cost = 0;
for(int i=0;i<LOOPS;i++){
timer.reset();
Matrix<float> c = matmul_openmp<float>(a, b, OMP_THREADS_NUM);
time_cost+=timer.elapsed_ms();
}
time_cost /= LOOPS;
return &time_cost;
}
// Benchmarks the naive and the OpenMP matrix multiplication on two random
// 2000 x 2000 float matrices and prints the mean time per multiplication.
// Returns 0 on success and 1 if a benchmark thread could not be created,
// joined, or did not deliver a result.
int main(){
    /**
     * MatMul benchmark
     *
     * C = MatMul(A,B)
     *
     * Matrix A: M*N
     * Matrix B: N*K
     * Matrix C: M*K
     */
    // Create the operand matrices.
    const int M = 2000;
    const int N = 2000;
    const int K = 2000;
    Matrix<float> A = randMatrix<float>(M, N);
    Matrix<float> B = randMatrix<float>(N, K);
    InputMatrixs inputs;
    inputs.a = &A;
    inputs.b = &B;

    // Create two pthreads:
    //   tid_native runs matmul_native, the unoptimized MatMul;
    //   tid_omp runs matmul_openmp, the OpenMP-parallelized MatMul.
    // NOTE(review): both benchmarks run at the same time, so they may compete
    // for CPU cores and influence each other's timings - confirm this is intended.
    pthread_t tid_native, tid_omp;
    if(pthread_create(&tid_native, nullptr, test_native_matmul, &inputs) != 0){
        std::cerr<<"failed to create the native MatMul thread"<<std::endl;
        return 1;
    }
    if(pthread_create(&tid_omp, nullptr, test_omp_matmul, &inputs) != 0){
        std::cerr<<"failed to create the OpenMP MatMul thread"<<std::endl;
        // The running thread still reads inputs, A and B; wait for it before
        // they go out of scope.
        pthread_join(tid_native, nullptr);
        return 1;
    }

    // Collect the mean run time reported by each benchmark thread.
    void* time_native = nullptr;
    void* time_omp = nullptr;
    const int join_native = pthread_join(tid_native, &time_native);
    const int join_omp = pthread_join(tid_omp, &time_omp);
    if(join_native != 0 || join_omp != 0 || time_native == nullptr || time_omp == nullptr){
        std::cerr<<"failed to collect the benchmark results"<<std::endl;
        return 1;
    }
    printf("MatMul(native):%fms\n", *static_cast<double*>(time_native));
    printf("MatMul(omp):%fms\n", *static_cast<double*>(time_omp));
    return 0;
}