我从教程中了解到,AltiVec 的未对齐加载和存储可以按如下方式实现:
/*
 * Load a 16-byte vector from a possibly unaligned location in memory.
 *
 * src:     address of the first of the 16 bytes to load (any alignment).
 * returns: the 16 bytes src[0..15] as one vector.
 *
 * vec_ld ignores the low four address bits, so two aligned loads are
 * combined with vec_perm using the shift pattern from vec_lvsl.
 * NOTE(review): relies on big-endian element ordering (classic AltiVec);
 * confirm before building for a little-endian target.
 */
__vector unsigned char LoadUnaligned(const unsigned char * src )
{
    __vector unsigned char permuteVector = vec_lvsl(0, src);
    __vector unsigned char low = vec_ld( 0, src);
    //Offset 15 instead of 16: when src is already aligned this reloads the
    //same block as 'low' instead of touching the 16 bytes after the data,
    //which may lie past the end of the buffer. For every other alignment
    //it addresses the same block that offset 16 would.
    __vector unsigned char high = vec_ld( 15, src);
    return vec_perm( low, high, permuteVector);
}
/*
 * Store a 16-byte vector to a possibly unaligned location in memory.
 *
 * v:   the 16 bytes to write.
 * dst: address of the first destination byte (any alignment; vec_ld and
 *      vec_st ignore the low four address bits).
 *
 * The one or two aligned 16-byte blocks that overlap dst[0..15] are loaded,
 * the new bytes are merged in, and the blocks are written back. Bytes of
 * those blocks outside dst[0..15] are rewritten with the values just read.
 * NOTE(review): relies on big-endian element ordering (classic AltiVec);
 * confirm before building for a little-endian target.
 */
void StoreUnaligned(__vector unsigned char v, __vector unsigned char * dst)
{
    //Load the surrounding area. Offset 15 (not 16) makes 'high' refer to the
    //same block as 'low' when dst is already aligned, so nothing beyond
    //dst[0..15] is read or written in that case.
    __vector unsigned char low = vec_ld( 0, dst);
    __vector unsigned char high = vec_ld( 15, dst);
    //Prepare the constants that we need
    __vector unsigned char permuteVector = vec_lvsr( 0, (const unsigned char *) dst);
    __vector signed char oxFF = vec_splat_s8( -1 );
    __vector signed char ox00 = vec_splat_s8( 0 );
    //Make a mask for which parts of the vectors to swap out: with
    //sh = address & 15, the first sh bytes are 0x00 and the rest are 0xFF
    __vector unsigned char mask = (__vector unsigned char) vec_perm( ox00, oxFF, permuteVector );
    //Right rotate our input data by sh bytes
    v = vec_perm( v, v, permuteVector );
    //Insert our data into the low and high vectors. vec_sel(a, b, m) takes
    //b where m is set: 'low' keeps its first sh bytes and receives the start
    //of the data after them; 'high' receives the end of the data in its
    //first sh bytes and keeps the rest.
    low = vec_sel( low, v, mask );
    high = vec_sel( v, high, mask );
    //Store the two aligned result vectors. 'high' goes first: in the aligned
    //case both stores hit the same block, 'high' holds the old contents and
    //'low' holds the new data, so 'low' must be written last.
    vec_st( high, 15, dst);
    vec_st( low, 0, dst);
}
这看起来很糟糕:仅仅为了存储一个向量就要做这么多工作!而且会带来相应的性能损失。例如下面两个函数:
void SomeFuncA(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = vec_ld(0, src + i);
//simple work
vec_st(a, 0, dst + i);
}
}
void SomeFuncU(const unsigned char * src, size_t size, unsigned char * dst)
{
for(size_t i = 0; i < size; i += 16)
{
__vector unsigned char a = LoadUnaligned(src + i);
//simple work
StoreUnaligned(dst + i, a);
}
}
第二个函数的运行速度比第一个慢 3-4 倍。由于我无法控制输入和输出内存的对齐方式,所以两个版本都必须实现。怎样才能最大程度地减少未对齐情况下的性能损失?