我目前正在优化 OpenCL 代码以提高性能,我将以下方法确定为性能瓶颈,我想用等效的 OpenCL-SIMD-Replacement 替换它:
typedef unsigned long uint64_t;
void multiply256(const unsigned int x[8], const unsigned int y[8], unsigned int out_high[8], unsigned int out_low[8])
{
unsigned int z[16];
unsigned int high = 0;
uint64_t product = 0;
// First round, overwrite z
for(int j = 7; j >= 0; j--) {
product = (uint64_t)x[7] * y[j] + high;
z[7 + j + 1] = (unsigned int)product;
high = (unsigned int)(product >> 32);
}
z[7] = high;
for(int i = 6; i >= 0; i--) {
high = 0;
for(int j = 7; j >= 0; j--) {
product = (uint64_t)x[i] * y[j] + z[i + j + 1] + high;
z[i + j + 1] = (unsigned int)product;
high = product >> 32;
}
z[i] = high;
}
for(int i = 0; i < 8; i++) {
out_high[i] = z[i];
out_low[i] = z[8 + i];
}
}
所以我想我可以像这样替换它:
void multiply256(const unsigned int x[8], const unsigned int y[8], unsigned int out_high[8], unsigned int out_low[8])
{
uint8 x8;
x8[0] = x[0];
x8[1] = x[1];
x8[2] = x[2];
x8[3] = x[3];
x8[4] = x[4];
x8[5] = x[5];
x8[6] = x[6];
x8[7] = x[7];
uint8 y8;
y8[0] = y[0];
y8[1] = y[1];
y8[2] = y[2];
y8[3] = y[3];
y8[4] = y[4];
y8[5] = y[5];
y8[6] = y[6];
y8[7] = y[7];
uint8 high = mul_hi(x8, y8);
uint8 low = x8 * y8;
out_high[0] = high[0];
out_high[1] = high[1];
out_high[2] = high[2];
out_high[3] = high[3];
out_high[4] = high[4];
out_high[5] = high[5];
out_high[6] = high[6];
out_high[7] = high[7];
out_low[0] = low[0];
out_low[1] = low[1];
out_low[2] = low[2];
out_low[3] = low[3];
out_low[4] = low[4];
out_low[5] = low[5];
out_low[6] = low[6];
out_low[7] = low[7];
}
但它没有相同的结果。我究竟做错了什么?
谢谢