I'm trying to implement an efficient radix sort in DirectX 11. I need to sort up to 256K items as fast as possible. The sort doesn't need to be in-place. Using CUDA/OpenCL is not an option.
I almost have it working, but there are a few problems:
- Histogram generation is fast, but it still takes considerably longer than the timings quoted online
- In the subsequent sort passes, keys whose lower-order digits are equal don't keep their relative order, because the write position comes from the InterlockedAdd on the histogram buffer in cp_sort (see below), and nothing orders those atomics between threads
- cp_sort is really slow, because that same InterlockedAdd is a global memory access performed for every element
I've been trying to work out how to fix this from the algorithms described online, but I can't seem to make sense of them (my best attempt at following them is sketched after the code below).
Here are my 3 kernels. (This is for a billboarded particle system, so the term "quad" just means an item to be sorted.)
// 8 bits per radix
#define NUM_RADIXES 4
#define RADIX_SIZE 256
// buffers
StructuredBuffer<Quad> g_particleQuadBuffer : register( t20 );
RWStructuredBuffer<uint> g_indexBuffer : register( u0 );
RWStructuredBuffer<uint> g_histogram : register( u1 );
RWStructuredBuffer<uint> g_indexOutBuffer : register( u2 );
// quad buffer counter
cbuffer BufferCounter : register( b12 )
{
    uint numQuads;
    uint pad[ 3 ];
}
// on-chip memory for fast histogram calculation
#define SHARED_MEM_PADDING 8 // to try and reduce bank conflicts
groupshared uint g_localHistogram[ NUM_RADIXES * RADIX_SIZE * SHARED_MEM_PADDING ];
// convert a float to a sortable int, assuming all inputs are negative
uint floatToUInt( float input )
{
    return 0xffffffff - asuint( input );
}
// initialise the indices, and build the histograms (dispatched with numQuads / ( NUM_RADIXES * RADIX_SIZE ))
[numthreads( NUM_RADIXES * RADIX_SIZE, 1, 1 )]
void cp_histogram( uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex )
{
    // initialise local histogram
    g_localHistogram[ groupIndex * SHARED_MEM_PADDING ] = 0;
    GroupMemoryBarrierWithGroupSync();
    // check within range
    uint quadIndex = ( groupID.x * NUM_RADIXES * RADIX_SIZE ) + groupIndex;
    if ( quadIndex < numQuads )
    {
        // initialise index
        g_indexBuffer[ quadIndex ] = quadIndex;
        // floating point to sortable uint
        uint value = floatToUInt( g_particleQuadBuffer[ quadIndex ].v[ 0 ].projected.z );
        // build 8-bit histograms
        uint value0 = ( value ) & 0xff;
        uint value1 = ( value >> 8 ) & 0xff;
        uint value2 = ( value >> 16 ) & 0xff;
        uint value3 = ( value >> 24 );
        InterlockedAdd( g_localHistogram[ ( value0 ) * SHARED_MEM_PADDING ], 1 );
        InterlockedAdd( g_localHistogram[ ( value1 + 256 ) * SHARED_MEM_PADDING ], 1 );
        InterlockedAdd( g_localHistogram[ ( value2 + 512 ) * SHARED_MEM_PADDING ], 1 );
        InterlockedAdd( g_localHistogram[ ( value3 + 768 ) * SHARED_MEM_PADDING ], 1 );
    }
    // write back to histogram
    GroupMemoryBarrierWithGroupSync();
    InterlockedAdd( g_histogram[ groupIndex ], g_localHistogram[ groupIndex * SHARED_MEM_PADDING ] );
}
// build the offsets based on histograms (dispatched with 1)
// NOTE: I know this could be more efficient, but from my profiling its time is negligible compared to the other 2 stages, and I can optimise it separately using a parallel prefix sum if I need to (rough sketch of that after this function)
[numthreads( NUM_RADIXES, 1, 1 )]
void cp_offsets( uint groupIndex : SV_GroupIndex )
{
    uint sum = 0;
    uint base = ( groupIndex * RADIX_SIZE );
    for ( uint i = 0; i < RADIX_SIZE; i++ )
    {
        uint tempSum = g_histogram[ base + i ] + sum;
        g_histogram[ base + i ] = sum;
        sum = tempSum;
    }
}
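// (Aside for this post, not part of my working code: a rough sketch of the parallel
// prefix sum mentioned above, in case it matters later. It would be dispatched with
// NUM_RADIXES groups of RADIX_SIZE threads instead of a single group; cp_offsets_scan
// and g_scanTemp are names I made up, and I haven't profiled this.)
groupshared uint g_scanTemp[ RADIX_SIZE ];
[numthreads( RADIX_SIZE, 1, 1 )]
void cp_offsets_scan( uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex )
{
    uint binIndex = ( groupID.x * RADIX_SIZE ) + groupIndex;
    // load this radix's 256 counts into shared memory
    g_scanTemp[ groupIndex ] = g_histogram[ binIndex ];
    GroupMemoryBarrierWithGroupSync();
    // Hillis-Steele inclusive scan over the 256 bins
    for ( uint stride = 1; stride < RADIX_SIZE; stride <<= 1 )
    {
        uint value = ( groupIndex >= stride ) ? g_scanTemp[ groupIndex - stride ] : 0;
        GroupMemoryBarrierWithGroupSync();
        g_scanTemp[ groupIndex ] += value;
        GroupMemoryBarrierWithGroupSync();
    }
    // shift right by one slot to make the result exclusive, matching what cp_offsets writes
    g_histogram[ binIndex ] = ( groupIndex > 0 ) ? g_scanTemp[ groupIndex - 1 ] : 0;
}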
// move the data (dispatched with numQuads / ( NUM_RADIXES * RADIX_SIZE ))
uint currentRadix;
[numthreads( NUM_RADIXES * RADIX_SIZE, 1, 1 )]
void cp_sort( uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex )
{
    // check within range
    uint quadIndex = ( groupID.x * NUM_RADIXES * RADIX_SIZE ) + groupIndex;
    if ( quadIndex < numQuads )
    {
        uint fi = g_indexBuffer[ quadIndex ];
        uint depth = floatToUInt( g_particleQuadBuffer[ fi ].v[ 0 ].projected.z );
        uint radix = currentRadix;
        uint pos = ( depth >> ( radix * 8 ) ) & 0xff;
        uint writePosition;
        InterlockedAdd( g_histogram[ radix * RADIX_SIZE + pos ], 1, writePosition );
        g_indexOutBuffer[ writePosition ] = fi;
    }
}
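For reference, here is my best attempt at following what the online descriptions seem to do to keep the pass stable: rank each key locally within its group so that keys with equal digits keep their original order, then rebuild the global write position from the scanned histograms plus that local rank, instead of taking it from a global InterlockedAdd. This is only a sketch of the local ranking part (cp_sort_rank_sketch, SORT_TILE_SIZE and g_tileDigits are names I made up for this post), and it's almost certainly not how the fast implementations actually do it:
// tile of keys handled by one group per pass
#define SORT_TILE_SIZE ( NUM_RADIXES * RADIX_SIZE )
groupshared uint g_tileDigits[ SORT_TILE_SIZE ];
[numthreads( SORT_TILE_SIZE, 1, 1 )]
void cp_sort_rank_sketch( uint3 groupID : SV_GroupID, uint groupIndex : SV_GroupIndex )
{
    uint quadIndex = ( groupID.x * SORT_TILE_SIZE ) + groupIndex;
    // out-of-range threads get a digit value that never matches a real one
    uint digit = 0xffffffff;
    if ( quadIndex < numQuads )
    {
        uint fi = g_indexBuffer[ quadIndex ];
        uint depth = floatToUInt( g_particleQuadBuffer[ fi ].v[ 0 ].projected.z );
        digit = ( depth >> ( currentRadix * 8 ) ) & 0xff;
    }
    g_tileDigits[ groupIndex ] = digit;
    GroupMemoryBarrierWithGroupSync();
    // stable local rank: how many earlier keys in this tile carry the same digit
    // (naive O(n) loop purely to show the idea; the real algorithms scan per-digit flags)
    uint localRank = 0;
    for ( uint i = 0; i < groupIndex; i++ )
    {
        localRank += ( g_tileDigits[ i ] == digit ) ? 1 : 0;
    }
    // the final write position would then, I think, be:
    //     exclusive global offset of 'digit' (what cp_offsets already produces)
    //   + number of keys with 'digit' in all earlier tiles (per-tile histograms + another scan?)
    //   + localRank
    // which stays stable with no global InterlockedAdd, but the middle term is the part
    // I can't see how to compute efficiently.
}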
Can anyone offer any help on how to fix/optimise this? I'd love to understand what some of the more sophisticated GPU radix sort algorithms are actually doing!
Thanks!