I am implementing k-means on the GPU, and I currently have the following code:
__device__ unsigned int cuda_delta = 0;
__global__ void kmeans_kernel(const sequence_t *data,
                              const sequence_t *centroids,
                              int *membership,
                              unsigned int n,
                              unsigned int numClusters)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        int min_distance = INT_MAX;
        int nearest = -1;
        // Find the nearest centroid for this data point.
        for (int i = 0; i < numClusters; i++) {
            sequence_t centroid = centroids[i];
            int dist = distance(centroid, data[index]);
            if (dist < min_distance) {
                nearest = i;
                min_distance = dist;
            }
        }
        // Count the point if its cluster assignment changed.
        if (membership[index] != nearest) {
            membership[index] = nearest;
            atomicInc(&cuda_delta, n);
        }
    }
}
As you can see, the algorithm has no data dependencies; the only shared state is the variable cuda_delta, which lives in global memory. According to the CUDA documentation:

An atomic function performs a read-modify-write atomic operation on one 32-bit or 64-bit word residing in global or shared memory.

This is exactly what I need.
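To illustrate, here is a minimal standalone sketch of that counter pattern (test_counter and count_kernel are made-up names for this example only, not part of my project): each participating thread increments a __device__ counter once with atomicInc, and the host resets it and reads it back through cudaMemcpyToSymbol / cudaMemcpyFromSymbol.

#include <cstdio>
#include <cuda_runtime.h>

// Counter in global memory, mirroring the role of cuda_delta.
__device__ unsigned int test_counter = 0;

// Each of the n participating threads increments the counter exactly once.
// atomicInc wraps when the old value reaches the second argument, so with
// at most n increments the final value is exactly n.
__global__ void count_kernel(unsigned int n)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        atomicInc(&test_counter, n);
}

int main()
{
    const unsigned int n = 18634;
    const unsigned int threadsPerBlock = 1024;
    const unsigned int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;

    unsigned int host_counter = 0;
    cudaMemcpyToSymbol(test_counter, &host_counter, sizeof(unsigned int));

    count_kernel<<<numBlocks, threadsPerBlock>>>(n);
    cudaDeviceSynchronize();

    cudaMemcpyFromSymbol(&host_counter, test_counter, sizeof(unsigned int));
    printf("counter = %u (expected %u)\n", host_counter, n);
    return 0;
}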
EDIT - here is all of my host code:
unsigned int delta = 0; // Number of objects whose cluster assignment changed in the current iteration
label = (int*)calloc(data_size,sizeof(int));
centroids = (sequence_t*)calloc(clusters,sizeof(sequence_t));
// cuda variables
sequence_t * cuda_data = NULL;
sequence_t * cuda_centroids = NULL;
int *cuda_membership = NULL;
unsigned int *cuda_tmp_centroidCount = NULL;
const unsigned int threadsPerBlock = 1024;
const unsigned int numBlocks = (data_size + threadsPerBlock - 1) / threadsPerBlock;
const unsigned int numBlocks2 = (clusters + threadsPerBlock - 1) / threadsPerBlock;
// Pick evenly spaced data points as the initial centroids.
for (unsigned int i = 0; i < clusters; i++) {
    int h = i * data_size / clusters;
    centroids[i] = make_ulong3(data[h].x, data[h].y, data[h].z);
}
memset (label,-1,data_size * sizeof(int));
checkCuda(cudaMalloc(&cuda_data, data_size * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_centroids, clusters * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_membership, data_size * sizeof(int)));
checkCuda(cudaMalloc(&cuda_tmp_centroidCount, clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int)));
checkCuda(cudaMemcpy(cuda_data,data, data_size *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_centroids, centroids, clusters *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_membership, label, clusters *sizeof(int) , cudaMemcpyHostToDevice));
int pc = 0;
do {
    cudaMemset(cuda_tmp_centroidCount, 0, clusters * BIT_SIZE_OF(sequence_t) * sizeof(unsigned int));
    // Reset the device-side counter before each iteration.
    delta = 0;
    checkCuda(cudaMemcpyToSymbol(cuda_delta, &delta, sizeof(unsigned int), 0, cudaMemcpyHostToDevice));
    kmeans_kernel <<< numBlocks, threadsPerBlock >>>(cuda_data,
                                                     cuda_centroids,
                                                     cuda_membership,
                                                     data_size,
                                                     clusters);
    cudaDeviceSynchronize();
    checkCuda(cudaMemcpyFromSymbol(&delta, cuda_delta, sizeof(unsigned int)));
    printf("%d - delta = %u\n", pc, delta);
    checkCuda(cudaGetLastError());
    pc++;
} while (delta > 0);
// copy output
checkCuda(cudaMemcpy(label,cuda_membership, clusters *sizeof(int) , cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(centroids,cuda_centroids, clusters *sizeof(sequence_t) , cudaMemcpyDeviceToHost));
// free cuda memory
checkCuda(cudaFree(cuda_data));
checkCuda(cudaFree(cuda_centroids));
checkCuda(cudaFree(cuda_membership));
checkCuda(cudaFree(cuda_tmp_centroidCount));
checkCuda(cudaDeviceReset());
If I run the code multiple times, the delta value printed for the first iteration changes from run to run, which it should not. Most of the time the printed values are:
0 - delta = 18630
0 - delta = 859
The expected value is 18634. Am I missing something here?
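For reference, the expected value comes from a CPU-side count along these lines (a sketch only; host_distance is a hypothetical CPU equivalent of the device distance(), and sequence_t is the type used above). Since membership is initialised to -1 and nearest is always >= 0, every point should change cluster in the first iteration, so the expected delta is simply the number of points.

// Host-side reference count (host_distance is a hypothetical CPU helper).
unsigned int expected_delta(const sequence_t *data, const sequence_t *centroids,
                            const int *membership, unsigned int n,
                            unsigned int numClusters)
{
    unsigned int delta = 0;
    for (unsigned int p = 0; p < n; p++) {
        int min_distance = INT_MAX;
        int nearest = -1;
        for (unsigned int c = 0; c < numClusters; c++) {
            int d = host_distance(centroids[c], data[p]);
            if (d < min_distance) {
                min_distance = d;
                nearest = (int)c;
            }
        }
        if (membership[p] != nearest)
            delta++;    // with membership[] full of -1 this fires for every point
    }
    return delta;
}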
EDIT: The full code is available on GitHub. To run the example, just build it with make and run the program several times with the arguments below; you will see that the first iteration's delta is not always the expected value.
./cuda-means mus_musmusculus.dat 859
Thanks in advance!