2

我有一个非常简单的算法,可以计算两个矩阵的相应行之间的平方欧几里得距离。我有以下代码,但不幸的是它没有为不同的矩阵大小返回正确的结果。更具体地说,它适用于大小为2000x4, 500x4, 2500x2, 600x8,的矩阵1000x8100x8但不适用于大小为2500x3, 2500x5, 400x3, 100x3, 100x10, 1000x10, 1000x12,500x12的矩阵500x14

有谁能够帮我?我想手动完成,不使用任何优化库,因为我想了解线程管理。

__global__ void cudaEuclid( float* A, float* B, float* C, int rows, int cols )
    {
        int i, squareeucldist = 0;
        int r = blockDim.x * blockIdx.x + threadIdx.x; // rows
        int c = blockDim.y * blockIdx.y + threadIdx.y; // cols 
        extern __shared__ float sdata[];
        //int r = blockIdx.y; int c = threadIdx.x;
        if( r < rows && c < cols  ){

            //C[r + rows*c] = ( A[r + rows*c] - B[r + rows*c] ) * ( A[r + rows*c] - B[r + rows*c] );


            sdata[threadIdx.x] = ( A[r + rows*c] - B[r + rows*c] ) * ( A[r + rows*c] - B[r + rows*c] );

            __syncthreads();

            // contiguous range pattern
            for(int offset = blockDim.x / 2;
                offset > 0;
                offset >>= 1)
            {
                if(threadIdx.x < offset)
                {
                    // add a partial sum upstream to our own
                    sdata[threadIdx.x] += sdata[threadIdx.x + offset];
                }

                // wait until all threads in the block have
                // updated their partial sums
                __syncthreads();
            }

            // thread 0 writes the final result
            if(threadIdx.x == 0)
            {
                C[r] = sdata[0];
            }

        }

    }

内核调用是:

dim3 dimBlock( cols, 1 ); 
dim3 dimGrid( 1, rows ); 
cudaEuclid<<<dimGrid, cols, cols*sizeof(float)>>>( d_A, d_B, d_C, rows, cols );

PS:我想提一下,我发布了一个类似的问题,但从一开始就不清楚,讨论也很混乱。尽管 Tom 提出了一个非常有用的建议,即它在未来对优化实现非常实用,但我需要更多手工制作的东西。最后,我发这个帖子的原因是我不想让相关帖子变得更复杂。谢谢。

4

2 回答 2

1

实际上,您的代码仅在m * 2^n足够n小的情况下才有效。您可能想更仔细地阅读第 14 页上的以下幻灯片,

http://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf

并思考以下问题

  1. 当你blockDim.x等于 3 或 5 时会发生什么;
  2. blockDim.x当或cols不是 2 的幂时,如何正确地进行并行归约;
  3. 为什么减少的结果比预期的要小;
  4. 哪些元素sdata[]未添加到最终总和中;
  5. 如果您在5blockDim.x时将 smem 的大小设置为 2^3,结果是否正确?cols
  6. 在 q5 的情况下,如何处理额外的 3 元素空间smem[5..7]

尝试用笔和纸逐步模拟运行 for 循环会有所帮助。

于 2013-10-11T18:56:22.727 回答
1

尽管 OP 不想使用优化的库来回答他的问题,但该帖子有一个有用的标题,其他用户可以发现它在没有手写内核的情况下解决问题很有用。

我很好奇,并考虑了使用 CUDA Thrust 的问题。我最终得到了下面的代码,它使用 . 来计算两个矩阵的同源行之间的距离thrust::reduce_by_key

#include <thrust\device_vector.h>
#include <thrust\transform_reduce.h>
#include <thrust\sequence.h>
#include <thrust\random.h>
#include <thrust\gather.h>
#include <thrust\extrema.h>

using namespace thrust::placeholders;

/****************************************************/
/* POWER DIFFERENCE FUNCTOR FOR EUCLIDEAN DISTANCES */
/****************************************************/
struct PowerDifference {
    __host__ __device__ float operator()(const float& a, const float& b) const { return pow(a - b, 2); }
};

/*******************/
/* EXPAND OPERATOR */
/*******************/
template <typename InputIterator1, typename InputIterator2, typename OutputIterator>
OutputIterator expand(InputIterator1 first1,
                      InputIterator1 last1,
                      InputIterator2 first2,
                      OutputIterator output)
{
    typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;

    difference_type input_size  = thrust::distance(first1, last1);
    difference_type output_size = thrust::reduce(first1, last1);

    // scan the counts to obtain output offsets for each input element
    thrust::device_vector<difference_type> output_offsets(input_size, 0);
    thrust::exclusive_scan(first1, last1, output_offsets.begin()); 

    // scatter the nonzero counts into their corresponding output positions
    thrust::device_vector<difference_type> output_indices(output_size, 0);
    thrust::scatter_if(thrust::counting_iterator<difference_type>(0), thrust::counting_iterator<difference_type>(input_size),
                       output_offsets.begin(), first1, output_indices.begin());

    // compute max-scan over the output indices, filling in the holes
    thrust::inclusive_scan(output_indices.begin(), output_indices.end(), output_indices.begin(), thrust::maximum<difference_type>());

    // gather input values according to index array (output = first2[output_indices])
    OutputIterator output_end = output; thrust::advance(output_end, output_size);
    thrust::gather(output_indices.begin(), output_indices.end(), first2, output);

    // return output + output_size
    thrust::advance(output, output_size);

    return output;
}

/********/
/* MAIN */
/********/
int main()
{
    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/

    const int N     = 10;           // --- Number of vector elements
    const int Nvec  = 20;           // --- Number of vectors for each matrix

    // --- Random uniform integer distribution between 0 and 100
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 20);

    // --- Matrix allocation and initialization
    thrust::device_vector<float> d_matrix1(Nvec * N);
    thrust::device_vector<float> d_matrix2(Nvec * N);
    for (size_t i = 0; i < d_matrix1.size(); i++) d_matrix1[i] = (float)dist(rng);
    for (size_t i = 0; i < d_matrix2.size(); i++) d_matrix2[i] = (float)dist(rng);

    printf("\n\nFirst matrix\n");
    for(int i = 0; i < Nvec; i++) {
        std::cout << " [ ";
        for(int j = 0; j < N; j++)
            std::cout << d_matrix1[i * N + j] << " ";
        std::cout << "]\n";
    }

    printf("\n\nSecond matrix\n");
    for(int i = 0; i < Nvec; i++) {
        std::cout << " [ ";
        for(int j = 0; j < N; j++)
            std::cout << d_matrix2[i * N + j] << " ";
        std::cout << "]\n";
    }

    /****************************************************************************/
    /* CALCULATING THE EUCLIDEAN DISTANCES BETWEEN THE ROWS OF THE TWO MATRICES */
    /****************************************************************************/
    // --- Creating the indices for the reduction by key
    thrust::device_vector<int> d_sequence(Nvec);
    thrust::device_vector<int> d_indices(Nvec * N);
    thrust::device_vector<int> d_counts(Nvec, N);
    thrust::sequence(d_sequence.begin(), d_sequence.begin() + Nvec);
    expand(d_counts.begin(), d_counts.end(), d_sequence.begin(), d_indices.begin());

    printf("\n\nSecond matrix\n");
    for(int i = 0; i < Nvec; i++) {
        std::cout << " [ ";
        for(int j = 0; j < N; j++)
            std::cout << d_indices[i * N + j] << " ";
        std::cout << "]\n";
    }

    thrust::device_vector<float> d_squared_differences(Nvec * N);

    thrust::transform(d_matrix1.begin(), d_matrix1.end(), d_matrix2.begin(), d_squared_differences.begin(), PowerDifference());

    thrust::device_vector<float> d_norms(Nvec);
    thrust::reduce_by_key(d_indices.begin(), d_indices.end(), d_squared_differences.begin(), d_indices.begin(), d_norms.begin());

    printf("\n\ndnorms\n");
    for(int i = 0; i < Nvec; i++) {
            std::cout << d_norms[i] << " ";
    }

    return 0; 
}
于 2015-06-25T17:26:58.287 回答