0

我正在 GPU 上实现 k-means,现在我有以下代码:

__device__ unsigned int cuda_delta = 0;

__global__ void kmeans_kernel(const sequence_t *data,
                          const sequence_t *centroids,
                          int * membership,
                          unsigned int n,
                          unsigned int numClusters )
{
int index = blockIdx.x * blockDim.x  + threadIdx.x;
if (index < n){

    int min_distance = INT_MAX;
    int nearest = -1;

    for (int i = 0; i < numClusters; i++){
        sequence_t centroid = centroids[i];
        int distance = distance(centroid, data[index]);
        if(distance < min_distance) {
            nearest = i;
            min_distance = distance;
        }
    }

    if(membership[index] != nearest) {
        membership[index]=nearest;
        atomicInc(&cuda_delta,n);
    }
}

可以看到,算法没有数据依赖,只有变量cuda_delta,存储在全局内存中。根据 com 文档:

原子函数对驻留在全局或共享内存中的一个 32 位或 64 位字执行读-修改-写原子操作

这正是我所需要的。 编辑 - 这是我所有的主机代码

unsigned int delta=0; //Number of objects has diverged in current iteration

label = (int*)calloc(data_size,sizeof(int));
centroids = (sequence_t*)calloc(clusters,sizeof(sequence_t));

// cuda variables
sequence_t * cuda_data = NULL;
sequence_t * cuda_centroids = NULL;
int *cuda_membership = NULL;
unsigned int *cuda_tmp_centroidCount = NULL;

const unsigned int threadsPerBlock = 1024;
const unsigned int numBlocks = (data_size + threadsPerBlock - 1) / threadsPerBlock;
const unsigned int numBlocks2 = (clusters + threadsPerBlock - 1) / threadsPerBlock;

for(unsigned int i = 0;i < clusters;i++) {
    int h = i * data_size / clusters;
    centroids[i] = make_ulong3(data[h].x,data[h].y,data[h].z);
}

memset (label,-1,data_size * sizeof(int));

checkCuda(cudaMalloc(&cuda_data, data_size * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_centroids, clusters * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_membership, data_size * sizeof(int)));
checkCuda(cudaMalloc(&cuda_tmp_centroidCount, clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int)));


checkCuda(cudaMemcpy(cuda_data,data, data_size *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_centroids, centroids, clusters *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_membership, label, clusters *sizeof(int) , cudaMemcpyHostToDevice));
int pc = 0;

do {

    cudaMemset (cuda_tmp_centroidCount,0,clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int));
    delta = 0;
    checkCuda(cudaMemcpyToSymbol(cuda_delta, &delta,sizeof(unsigned int),0,cudaMemcpyHostToDevice));
    kmeans_kernel <<< numBlocks,threadsPerBlock>>>(cuda_data,
                                                   cuda_centroids,
                                                   cuda_membership,
                                                   data_size,
                                                   clusters); 
    cudaDeviceSynchronize();
    checkCuda(cudaMemcpyFromSymbol(&delta,cuda_delta,sizeof(unsigned int)));
    printf ("%d - delta = %d\n",pc,delta);
    checkCuda(cudaGetLastError());
    pc++;
}
while(delta > 0);
// copy output
checkCuda(cudaMemcpy(label,cuda_membership, clusters *sizeof(int) , cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(centroids,cuda_centroids, clusters *sizeof(sequence_t) , cudaMemcpyDeviceToHost));

// free cuda memory
checkCuda(cudaFree(cuda_data));
checkCuda(cudaFree(cuda_centroids));
checkCuda(cudaFree(cuda_membership));
checkCuda(cudaFree(cuda_tmp_centroidCount));
checkCuda(cudaDeviceReset());

如果我多次运行代码,第一次迭代打印的 delta 值会发生变化,它不应该。大多数时候打印的值是:

0 - delta = 18630
0 - delta = 859

预期值为 18634。我在这里遗漏了什么吗?

编辑完整代码可在github上找到,运行示例只需使用 make 编译即可。并使用以下参数多次运行程序,您将看到第一次迭代的增量值并不总是预期的。

./cuda-means mus_musmusculus.dat 859

先谢谢了!

4

2 回答 2

1
cudaMemcpyToSymbol(cuda_delta, &delta,sizeof(unsigned int));

cudaMemcpyFromSymbol(&delta,cuda_delta,sizeof(unsigned int));

是你的问题。

从文档中:

    cudaError_t cudaMemcpyFromSymbol ( void* dst, const void* symbol, size_t count, size_t offset = 0, cudaMemcpyKind kind = cudaMemcpyDeviceToHost )
Copies data from the given symbol on the device.
    Parameters

dst
    - Destination memory address 
symbol
    - Device symbol address 
count
    - Size in bytes to copy 
offset
    - Offset from start of symbol in bytes 
kind
    - Type of transfer

cudaMemcpyFromSymbol期望地址符号作为第二个参数而不是设备符号。

您可以使用以下方式选择符号的地址cudaGetSymbolAddress ( void** devPtr, const void* symbol )

void*是纯粹的邪恶...

于 2013-09-18T05:48:56.110 回答
0

真丢人!原子操作运行良好。

我不是“memseting”成员数组。在我修复它之后,一切正常。

于 2013-09-19T20:19:02.823 回答