我有一个M*N
主机内存矩阵,在复制到设备内存后,我需要将其转置为N*M
矩阵。是否有任何 cuda (cuBLAS ...) API 这样做?我正在使用 CUDA 4。谢谢!
问问题
7479 次
3 回答
9
为了回答您关于效率的问题,我比较了两种执行矩阵转置的方法,一种使用 Thrust 库,另一种使用cublas<t>geam
Robert Crovella 建议的方法。在 Kepler K20c 卡上的比较结果如下:
| Matrix size | Thrust [ms] | cuBLAS [ms] |
| | | |
| 32x32 | 0.015 | 0.016 |
| 64x64 | 0.015 | 0.017 |
| 128x128 | 0.019 | 0.017 |
| 256x256 | 0.028 | 0.017 |
| 512x512 | 0.088 | 0.042 |
| 1024x1024 | 0.34 | 0.13 |
| 2048x2048 | 1.24 | 0.48 |
| 4096x4096 | 11.02 | 1.98 |
可以看出,该cublas<t>geam
版本优于使用 Thrust 的版本。下面是执行比较的代码。
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <iostream>
#include <iomanip>
#include <cublas_v2.h>
#include <conio.h>
#include <assert.h>
/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
if( CUBLAS_STATUS_SUCCESS != err) {
fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err);
getch(); cudaDeviceReset(); assert(0);
}
}
// convert a linear index to a linear index in the transpose
struct transpose_index : public thrust::unary_function<size_t,size_t>
{
size_t m, n;
__host__ __device__
transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}
__host__ __device__
size_t operator()(size_t linear_index)
{
size_t i = linear_index / n;
size_t j = linear_index % n;
return m * j + i;
}
};
// convert a linear index to a row index
struct row_index : public thrust::unary_function<size_t,size_t>
{
size_t n;
__host__ __device__
row_index(size_t _n) : n(_n) {}
__host__ __device__
size_t operator()(size_t i)
{
return i / n;
}
};
// transpose an M-by-N array
template <typename T>
void transpose(size_t m, size_t n, thrust::device_vector<T>& src, thrust::device_vector<T>& dst)
{
thrust::counting_iterator<size_t> indices(0);
thrust::gather
(thrust::make_transform_iterator(indices, transpose_index(n, m)),
thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(),
src.begin(),dst.begin());
}
// print an M-by-N array
template <typename T>
void print(size_t m, size_t n, thrust::device_vector<T>& d_data)
{
thrust::host_vector<T> h_data = d_data;
for(size_t i = 0; i < m; i++)
{
for(size_t j = 0; j < n; j++)
std::cout << std::setw(8) << h_data[i * n + j] << " ";
std::cout << "\n";
}
}
int main(void)
{
size_t m = 5; // number of rows
size_t n = 4; // number of columns
// 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
thrust::device_vector<double> data(m * n, 1.);
data[1] = 2.;
data[3] = 3.;
std::cout << "Initial array" << std::endl;
print(m, n, data);
std::cout << "Transpose array - Thrust" << std::endl;
thrust::device_vector<double> transposed_thrust(m * n);
transpose(m, n, data, transposed_thrust);
print(n, m, transposed_thrust);
std::cout << "Transpose array - cuBLAS" << std::endl;
thrust::device_vector<double> transposed_cuBLAS(m * n);
double* dv_ptr_in = thrust::raw_pointer_cast(data.data());
double* dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
double alpha = 1.;
double beta = 0.;
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m));
print(n, m, transposed_cuBLAS);
getch();
return 0;
}
于 2014-02-15T20:48:38.677 回答
8
在cublas API中:
cublas<t>geam()
This function performs the matrix-matrix addition/transposition
the user can transpose matrix A by setting *alpha=1 and *beta=0.
(并将 transa 运算符指定为 CUBLAS_OP_T 进行转置)
于 2013-03-17T07:52:24.990 回答
1
CULA 有辅助例程来计算转置 ( culaDevice?geTranspose
)。如果是方阵,您还可以使用就地转置 ( culaDevise?geTransposeInplace
)。
注意:如果您满足某些条件,CULA 有免费许可证可用。
于 2013-05-02T06:57:48.747 回答