您可以尝试矩阵的非线性内存布局以提高缓存利用率。使用 4x4 32 位浮动块,只需对每个缓存行进行一次访问即可进行转置。另外,作为奖励,可以使用 _MM_TRANSPOSE4_PS 轻松完成瓷砖转置。
转置一个非常大的矩阵仍然是非常占用内存的操作。它仍然会受到严重的带宽限制,但至少缓存字负载接近最佳。我不知道性能是否仍然可以优化。我的测试表明,几年前的笔记本电脑可以在大约 300 毫秒内转置 16k*16k(1G 内存)。
我也尝试使用 _mm_stream_sd 但由于某种原因它实际上使性能变差。我对非暂时性内存写入的理解不够深入,无法实际猜测为什么性能会随着 _mm_stream_ps 下降。可能的原因当然是缓存行已经在 L1 缓存中准备好进行写操作。
但实际上具有非线性矩阵的重要部分将有可能完全避免转置并且简单地以平铺友好的顺序运行乘法。但我只有转置代码,用于提高我对算法中缓存管理的了解。
我还没有尝试测试预取是否会提高内存带宽使用率。当前代码以每周期约 0.5 条指令运行(良好的缓存友好代码在此 CPU 上每周期运行约 2 英寸),这为预取指令留下了大量空闲周期,允许进行相当复杂的计算以优化运行时的预取时序。
下面是我的转置基准测试中的示例代码。
#define MATSIZE 16384
#define align(val, a) (val + (a - val % a))
#define tilewidth 4
typedef int matrix[align(MATSIZE,tilewidth)*MATSIZE] __attribute__((aligned(64)));
float &index(matrix m, unsigned i, unsigned j)
{
/* tiled address calculation */
/* single cache line is used for 4x4 sub matrices (64 bytes = 4*4*sizeof(int) */
/* tiles are arranged linearly from top to bottom */
/*
* eg: 16x16 matrix tile positions:
* t1 t5 t9 t13
* t2 t6 t10 t14
* t3 t7 t11 t15
* t4 t8 t12 t16
*/
const unsigned tilestride = tilewidth * MATSIZE;
const unsigned comp0 = i % tilewidth; /* i inside tile is least significant part */
const unsigned comp1 = j * tilewidth; /* next part is j multiplied by tile width */
const unsigned comp2 = i / tilewidth * tilestride;
const unsigned add = comp0 + comp1 + comp2;
return m[add];
}
/* Get start of tile reference */
float &tile(matrix m, unsigned i, unsigned j)
{
const unsigned tilestride = tilewidth * MATSIZE;
const unsigned comp1 = j * tilewidth; /* next part is j multiplied by tile width */
const unsigned comp2 = i / tilewidth * tilestride;
return m[comp1 + comp2];
}
template<bool diagonal>
static void doswap(matrix mat, unsigned i, unsigned j)
{
/* special path to swap whole tile at once */
union {
float *fs;
__m128 *mm;
} src, dst;
src.fs = &tile(mat, i, j);
dst.fs = &tile(mat, j, i);
if (!diagonal) {
__m128 srcrow0 = src.mm[0];
__m128 srcrow1 = src.mm[1];
__m128 srcrow2 = src.mm[2];
__m128 srcrow3 = src.mm[3];
__m128 dstrow0 = dst.mm[0];
__m128 dstrow1 = dst.mm[1];
__m128 dstrow2 = dst.mm[2];
__m128 dstrow3 = dst.mm[3];
_MM_TRANSPOSE4_PS(srcrow0, srcrow1, srcrow2, srcrow3);
_MM_TRANSPOSE4_PS(dstrow0, dstrow1, dstrow2, dstrow3);
#if STREAMWRITE == 1
_mm_stream_ps(src.fs + 0, dstrow0);
_mm_stream_ps(src.fs + 4, dstrow1);
_mm_stream_ps(src.fs + 8, dstrow2);
_mm_stream_ps(src.fs + 12, dstrow3);
_mm_stream_ps(dst.fs + 0, srcrow0);
_mm_stream_ps(dst.fs + 4, srcrow1);
_mm_stream_ps(dst.fs + 8, srcrow2);
_mm_stream_ps(dst.fs + 12, srcrow3);
#else
src.mm[0] = dstrow0;
src.mm[1] = dstrow1;
src.mm[2] = dstrow2;
src.mm[3] = dstrow3;
dst.mm[0] = srcrow0;
dst.mm[1] = srcrow1;
dst.mm[2] = srcrow2;
dst.mm[3] = srcrow3;
#endif
} else {
__m128 srcrow0 = src.mm[0];
__m128 srcrow1 = src.mm[1];
__m128 srcrow2 = src.mm[2];
__m128 srcrow3 = src.mm[3];
_MM_TRANSPOSE4_PS(srcrow0, srcrow1, srcrow2, srcrow3);
#if STREAMWRITE == 1
_mm_stream_ps(src.fs + 0, srcrow0);
_mm_stream_ps(src.fs + 4, srcrow1);
_mm_stream_ps(src.fs + 8, srcrow2);
_mm_stream_ps(src.fs + 12, srcrow3);
#else
src.mm[0] = srcrow0;
src.mm[1] = srcrow1;
src.mm[2] = srcrow2;
src.mm[3] = srcrow3;
#endif
}
}
}
static void transpose(matrix mat)
{
const unsigned xstep = 256;
const unsigned ystep = 256;
const unsigned istep = 4;
const unsigned jstep = 4;
unsigned x1, y1, i, j;
/* need to increment x check for y limit to allow unrolled inner loop
* access entries close to diagonal axis
*/
for (x1 = 0; x1 < MATSIZE - xstep + 1 && MATSIZE > xstep && xstep; x1 += xstep)
for (y1 = 0; y1 < std::min(MATSIZE - ystep + 1, x1 + 1); y1 += ystep)
for ( i = x1 ; i < x1 + xstep; i += istep ) {
for ( j = y1 ; j < std::min(y1 + ystep, i); j+= jstep )
{
doswap<false>(mat, i, j);
}
if (i == j && j < (y1 + ystep))
doswap<true>(mat, i, j);
}
for ( i = 0 ; i < x1; i += istep ) {
for ( j = y1 ; j < std::min(MATSIZE - jstep + 1, i); j+= jstep )
{
doswap<false>(mat, i, j);
}
if (i == j)
doswap<true>(mat, i, j);
}
for ( i = x1 ; i < MATSIZE - istep + 1; i += istep ) {
for ( j = y1 ; j < std::min(MATSIZE - jstep + 1, i); j+= jstep )
{
doswap<false>(mat, i, j);
}
if (i == j)
doswap<true>(mat, i, j);
}
x1 = MATSIZE - MATSIZE % istep;
y1 = MATSIZE - MATSIZE % jstep;
for ( i = x1 ; i < MATSIZE; i++ )
for ( j = 0 ; j < std::min((unsigned)MATSIZE, i); j++ )
std::swap(index(mat, i, j+0), index(mat, j+0, i));
for ( i = 0; i < x1; i++ )
for ( j = y1 ; j < std::min((unsigned)MATSIZE, i) ; j++ )
std::swap(index(mat, i, j+0), index(mat, j+0, i));
}