我正在尝试在 GPU 上实现矩阵向量乘法(使用 CUDA)。
在我的 C++ 代码 (CPU) 中,我将矩阵加载为密集矩阵,然后使用 CUDA 执行矩阵向量乘法。我还使用共享内存来提高性能。
- 知道我的矩阵是稀疏矩阵,如何以有效的方式加载矩阵?
下面是我加载矩阵的 C++ 函数:
int readMatrix( char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR )
{
unsigned int w, h, x, y, num_entries;
float val;
std::ifstream file( filename );
if ( file )
{
file >> h >> w >> num_entries;
cout << w << " " << h << " " << num_entries << "\n";
assert( w == h || w == 1 || h == 1 );
if( dim != NULL ) *dim = std::max( w, h );
matrix = new float[ w * h ];
unsigned int i;
for( i = 0; i < num_entries; i++ ){
if( file.eof() ) break;
file >> y >> x >> val;
if( majority == ROW_MAJOR ){
matrix[ w * y + x ] = val;
} else if( majority == COLUMN_MAJOR ){
matrix[ h * x + y ] = val;
}
}
file.close();
if( i == num_entries )
std::cout << "\nFile read successfully\n";
else
std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n";
// print first few elements
if( w == h ){
for( unsigned int i = 0; i < w; i++ ){
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j + w * i ] );
}
}
}
else{
printf("\n");
for( unsigned int j = 0; j < h; j++ ){
printf("%.2f ", matrix[ j ] );
}
}
} else {
std::cout << "Unable to open file\n";
return false;
}
return true;
}
下面是我处理矩阵向量乘法的 CUDA 内核函数:
__global__ void
_cl_matrix_vector_( float *A, float *b, float *x, int dim )
{
extern __shared__ float vec[];
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
float temp = 0.0;
int vOffs = 0;
//load vector into shared memory
for (int i = 0; i < (dim/blockDim.x) + 1 ; ++i, vOffs+= blockDim.x) {
vec[vOffs + threadIdx.x] = b[vOffs + threadIdx.x];
}
//make sure all threads are synchronized
__syncthreads();
if (idx < dim) {
temp = 0.0;
//dot product (multiplication)
for (int i = 0; i < dim; i++){
temp += A[idx * dim + i] * vec[i];
}
x[idx] = temp;
}
}
- 考虑到我的矩阵是稀疏矩阵,我必须对我的 CUDA 代码进行哪些必要的更改?
- 我从一个论坛发现我们也可以使用填充来优化性能,但这需要我改变读取矩阵/排序矩阵的方式。任何想法如何以我读取矩阵并执行计算的方式实现此填充?