我有以下 MATLAB 代码:
tempx = full(sum(X.^2, 2));
tempc = full(sum(C.^2, 2).');
D = -2*(X * C.');
D = bsxfun(@plus, D, tempx);
D = bsxfun(@plus, D, tempc);
其中 X 是 n×m 矩阵,C 是 k×m 矩阵;一个是数据矩阵,另一个是权重矩阵。我用上面的代码计算距离矩阵 D,并且正在寻找该操作的高效 cuBLAS 或 Thrust 实现。
D = -2*(X * C.'); 这一行我已经用 cuBLAS 实现了,但作为新手,其余部分对我来说仍然是个问题。有人可以提供代码片段或给出建议吗?
以下是我目前的代码。编辑:我补充了更多代码,现在还需要实现类似 bsxfun 的求和:作为最后一步,把向量 V 加到所有列上,并把向量 V2 加到所有行上。
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <iostream>

#include <cuda.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/system_error.h>
#include <thrust/transform.h>
#define N 4
#define K 4
#define M 3
// Unary functor returning the square of its argument.
// Marked __host__ __device__ so it can be used from both host and device code
// (e.g. as the operation passed to thrust::transform).
template <typename T>
struct square_op{
    __host__ __device__
    T operator()(const T& value) const {
        const T squared = value * value;
        return squared;
    }
};
// Prints a rows x cols matrix stored in row-major order, one row per line.
static void printRowMajor(const thrust::host_vector<float>& mat, int rows, int cols){
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            printf("%f ", mat[i*cols+j]);
        }
        printf("\n");
    }
}

/*
 * Demo driver: computes the N x K matrix of squared Euclidean distances between
 * the rows of A (N x M) and the rows of B (K x M), i.e. the MATLAB expression
 *
 *     D = bsxfun(@plus, bsxfun(@plus, -2*(A*B.'), sum(A.^2,2)), sum(B.^2,2).')
 *
 * All host/device matrices are stored row-major. cuBLAS is column-major, so a
 * row-major R x C buffer is passed to cuBLAS as a column-major C x R matrix
 * (its transpose) and the operation flags are chosen accordingly.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE if any cuBLAS/CUDA call fails.
 */
int main (void){
    cudaError_t cudaStat;
    cublasStatus_t stat;
    cublasHandle_t handle;
    stat = cublasCreate(&handle);
    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS initialization failure!!\n");
        return EXIT_FAILURE;
    }

    // Deterministic test data (0, 1, 2, ...), row-major.
    thrust::host_vector<float> C_h(N*K);   // result: N x K distance matrix
    thrust::host_vector<float> A_h(N*M);   // data matrix, N x M
    thrust::host_vector<float> B_h(K*M);   // weight matrix, K x M
    thrust::sequence(A_h.begin(),A_h.end());
    thrust::sequence(B_h.begin(),B_h.end());

    thrust::device_vector<float> A_d = A_h;
    thrust::device_vector<float> B_d = B_h;
    // Squares go into separate buffers: A_d/B_d must stay untouched because the
    // cross term -2*A*B' needs the original values.
    thrust::device_vector<float> A_sq_d(N*M);
    thrust::device_vector<float> B_sq_d(K*M);
    thrust::device_vector<float> C_d(N*K);
    // One vector of ones, long enough for the row sums (length M) and for the
    // two rank-1 broadcast updates (lengths N and K).
    const int onesLen = std::max(M, std::max(N, K));
    thrust::device_vector<float> ones_d(onesLen, 1.0f);
    thrust::device_vector<float> A_sum_vec_d(N,0);
    thrust::device_vector<float> B_sum_vec_d(K,0);

    const float* A_ptr     = thrust::raw_pointer_cast(A_d.data());
    const float* B_ptr     = thrust::raw_pointer_cast(B_d.data());
    const float* A_sq_ptr  = thrust::raw_pointer_cast(A_sq_d.data());
    const float* B_sq_ptr  = thrust::raw_pointer_cast(B_sq_d.data());
    const float* ones_ptr  = thrust::raw_pointer_cast(ones_d.data());
    float*       A_sum_ptr = thrust::raw_pointer_cast(A_sum_vec_d.data());
    float*       B_sum_ptr = thrust::raw_pointer_cast(B_sum_vec_d.data());
    float*       C_ptr     = thrust::raw_pointer_cast(C_d.data());

    printRowMajor(A_h, N, M);
    printf("\n");
    printRowMajor(B_h, K, M);
    printf("\n");

    std::cout<< "Starting GPU run" <<std::endl;
    cudaEvent_t start, stop;
    cudaStat = cudaEventCreate(&start);
    if (cudaStat != cudaSuccess){
        printf("CUDA failure: %s\n", cudaGetErrorString(cudaStat));
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    cudaStat = cudaEventCreate(&stop);
    if (cudaStat != cudaSuccess){
        printf("CUDA failure: %s\n", cudaGetErrorString(cudaStat));
        cudaEventDestroy(start);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    cudaEventRecord(start, 0);

    //************************************
    // Calculate Square Elements
    //************************************
    thrust::transform(A_d.begin(),A_d.end(),A_sq_d.begin(),square_op<float>());
    thrust::transform(B_d.begin(),B_d.end(),B_sq_d.begin(),square_op<float>());

    const float one = 1.0f;
    const float zero = 0.0f;
    const float minusTwo = -2.0f;
    // Name of the cuBLAS step being run; reported if that step fails.
    const char* step = "row sums of A.^2 (cublasSgemv)";

    //************************************
    // Sum of the Rows
    //************************************
    // Row-major N x M buffer == column-major M x N; OP_T turns it back into
    // N x M, so y = A_sq * ones gives the N row sums.
    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&one,A_sq_ptr,M,ones_ptr,1,&zero,A_sum_ptr,1);
    if (stat == CUBLAS_STATUS_SUCCESS){
        step = "row sums of B.^2 (cublasSgemv)";
        stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&one,B_sq_ptr,M,ones_ptr,1,&zero,B_sum_ptr,1);
    }

    //************************************
    // Matrix Multiplication
    //************************************
    // We want row-major D (N x K), which is column-major D' (K x N, ldc = K):
    //   D' = -2 * B * A'
    // B_d viewed column-major is B' (M x K) -> OP_T gives B  (K x M)
    // A_d viewed column-major is A' (M x N) -> OP_N gives A' (M x N)
    if (stat == CUBLAS_STATUS_SUCCESS){
        step = "-2*A*B' (cublasSgemm)";
        stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,K,N,M,&minusTwo,B_ptr,M,A_ptr,M,&zero,C_ptr,K);
    }

    //************************************
    // Final summation
    //************************************
    // bsxfun-style broadcasts as rank-1 updates on the column-major K x N view:
    //   D' += B_sum * ones(N)'   (adds B_sum[j] to every D[i][j])
    //   D' += ones(K) * A_sum'   (adds A_sum[i] to every D[i][j])
    if (stat == CUBLAS_STATUS_SUCCESS){
        step = "add B row norms (cublasSger)";
        stat = cublasSger_v2(handle,K,N,&one,B_sum_ptr,1,ones_ptr,1,C_ptr,K);
    }
    if (stat == CUBLAS_STATUS_SUCCESS){
        step = "add A row norms (cublasSger)";
        stat = cublasSger_v2(handle,K,N,&one,ones_ptr,1,A_sum_ptr,1,C_ptr,K);
    }

    cudaEventRecord(stop, 0);
    // Synchronizing on the stop event also surfaces asynchronous execution errors.
    cudaStat = cudaEventSynchronize(stop);
    float elapsedTime = 0.0f;
    if (cudaStat == cudaSuccess){
        cudaStat = cudaEventElapsedTime(&elapsedTime, start, stop);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    if (stat != CUBLAS_STATUS_SUCCESS){
        printf("CUBLAS failure in %s!!\n", step);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }
    if (cudaStat != cudaSuccess){
        printf("CUDA failure: %s\n", cudaGetErrorString(cudaStat));
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    // TEST output of the intermediate results (done outside the timed region).
    thrust::host_vector<float> A_sq_h = A_sq_d;
    printf("Matrix A after square!!\n");
    printRowMajor(A_sq_h, N, M);
    printf("\n");
    thrust::host_vector<float> B_sq_h = B_sq_d;
    printf("Matrix B after square!!\n");
    printRowMajor(B_sq_h, K, M);
    printf("\n");

    thrust::host_vector<float> A_sum_vec_h = A_sum_vec_d;
    printf("A_vec after row sum!!\n");
    for (int j = 0; j < N; ++j) {
        printf("%f ",A_sum_vec_h[j]);
    }
    printf("\n \n");
    thrust::host_vector<float> B_sum_vec_h = B_sum_vec_d;
    printf("B_vec after row sum!!\n");
    for (int j = 0; j < K; ++j) {
        printf("%f ",B_sum_vec_h[j]);
    }
    printf("\n \n");

    // cudaEventElapsedTime reports milliseconds; print seconds.
    float totalTime = elapsedTime/1000;
    std::cout<<"Elapsed_time:"<< totalTime<<std::endl;

    //copy back data
    thrust::copy(C_d.begin(),C_d.end(),C_h.begin());
    printRowMajor(C_h, N, K);

    cublasDestroy(handle);
    printf("Execution ends!!\n");
    return EXIT_SUCCESS;
}