内核update_umatrix
无法启动,分析器显示它需要-100%
!计算的时间。
这可能是一个简单的问题,但我已经花了两个星期的时间,但内核启动仍然无法根据 nsight 分析器启动,至少U
矩阵没有更新并且包含全零(这是 FCM 的部分实现)。
我的 GPU 是 GeForce 330M,计算能力为1.2
.
float *U;
float *V;
float *X;
__device__ float *U_d;
__device__ float *V_d;
__device__ float *X_d;
__global__ void update_umatrix(float *sqrerror,int C,int N,int S,float m)
{
int i,j,k;
int example_is_centroid;
float summation, D_ki, D_kj;
float newU;
__shared__ float tmp_sqrerror[DIM];
/* For each example in the dataset */
k = threadIdx.x + blockIdx.x*blockDim.x;
int local_offset = threadIdx.x;
tmp_sqrerror[local_offset]=0;
/* Special case: If Example is equal to a Cluster Centroid,
then U=1.0 for that cluster and 0 for all others */
if ( (example_is_centroid=is_example_centroid(k,S,C)) != -1 ) {
for(int i=0; i<C; i++)
{
if ( i == example_is_centroid )
U_d[k*C+i]=1.0;
else
U_d[k*C+i]=0.0;
}
return;
}
/* For each class */
for(int i=0; i< C; i++)
{
summation=0;
/* Calculate summation */
for (j=0; j < C; j++) {
D_ki=distance(X_d, V_d,k*DIM,i*S,S);
D_kj=distance(X_d, V_d,k*DIM,j*S,S);
summation += powf( D_ki / D_kj , (2.0/ (m-1)));
}
/* Weight is 1/sum */
newU=1.0/summation;
/* Add to the squareDifference */
tmp_sqrerror[local_offset] += powf(U_d[k*C+i] - newU, 2);
U_d[k*C+i]=newU;
}
__syncthreads();
int t= blockDim.x/2;
while(t>0)
{
if(k+t < N && threadIdx.x<t)
tmp_sqrerror[local_offset] += tmp_sqrerror[local_offset+t];
t/=2;
__syncthreads();
}
if(threadIdx.x==0)
sqrerror[blockIdx.x] = tmp_sqrerror[0];
}
int init()
{
float m = 2.0;
int C=2;
int S=2;
int N=340*340;
int i,j;
/* Allocate necessary storage */
V=(float *)CALLOC(S*C, sizeof(float));
U=(float *)CALLOC(C*N,sizeof(float));
cudaGetErrorString(cudaMalloc(&U_d,N*C*sizeof(float)));
cudaGetErrorString(cudaMalloc(&V_d,C*S*sizeof(float)));
/* Place random values in V, then update U matrix based on it */
srand48(seed);
for (i=0; i < C; i++) {
for (j=0; j < S; j++) {
V[i*S+j]=drand48() * max_value[j];
}
}
float *dummy;
cudaMalloc(&dummy,N*sizeof(float));
cudaGetErrorString(cudaMemcpyToSymbol(&V_d,V,C*S*sizeof(float),0,cudaMemcpyHostToDevice));
/* Once values are populated in V, update the U Matrix for sane values */
update_umatrix<<<(N+DIM-1)/DIM,DIM>>>(dummy,C,N,S,m);
cudaGetErrorString(cudaGetLastError());
cudaDeviceSynchronize();
cudaGetErrorString(cudaMemcpyFromSymbol(U,&U_d,N*C*sizeof(float),cudaMemcpyDeviceToHost));
fprintf(stdout,"Initialization completed.\n");
return 0;
}
如果某个 i 的 X[k] == V[i],则返回那个 i。否则,返回 -1
__device__ int is_example_centroid(int k,int S, int C)
{
int i,x;
for (i=0; i < C; i++) {
for (x=0; x < S; x++) {
if ( X_d[k*DIM+x] != V_d[i*S+x] ) break;
}
if ( x == S ) /* X==V */
return i;
}
return -1;
}
和距离函数:
__device__ float distance(float *v1, float *v2,int startV1,int startV2,int S)
{
int x,i;
float sum=0;
for (x=startV1,i=startV2; x < startV1+DIM && i<startV2+S; x++, i++)
sum += (v1[x] - v2[i]) * (v1[x] - v2[i]);
return sqrt(sum);
}