转换:
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
x[i] += a[i] + a[j];
进入:
for (i = 0; i < n; i++)
x[i] += n*a[i] + sum(a));
请参阅f_sse()
下面的代码:
#include <stdio.h>
#include <string.h>
#include <immintrin.h>
enum {
N = 4,
};
float x[N], a[N] = { .1, .2, .3, .4 }, y[N];
void f(float *x, float *a, int n)
{
int i, j;
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
x[i] += a[i] + a[j];
}
float array_sum(float *a, int n)
{
/* could be vectorized as well */
int i;
float s;
for (s = 0, i = 0; i < n; i++)
s += a[i];
return s;
}
void f_sse(float *x, float *a, int n)
{
int i, l;
float t;
__m128 sum_a, n_vec;
t = array_sum(a, n);
sum_a = _mm_set1_ps(t);
n_vec = _mm_set1_ps(n);
l = n / 4;
for (i = 0; i < l; i += 4) {
__m128 ai, xi;
ai = _mm_load_ps(a + i);
xi = _mm_load_ps(x + i);
ai = _mm_mul_ps(ai, n_vec);
ai = _mm_add_ps(ai, sum_a);
xi = _mm_add_ps(xi, ai);
_mm_store_ps(x + i, xi);
}
}
int main()
{
int i, r;
f(x, a, N);
f_sse(y, a, N);
r = memcmp(x, y, N);
if (r == 0)
return 0;
printf("x: { ");
for (i = 0; i < N; i++)
printf("%f, ", x[i]);
printf("}\n");
printf("y: { ");
for (i = 0; i < N; i++)
printf("%f, ", y[i]);
printf("}\n");
return 3;
}
由于您声明a
正在同时更新并且您不能进行上面的循环转换。您需要有意识地决定何时从a
. 它永远不会匹配原始的非矢量化版本,因为我们一次加载 4 个浮点数。
a[j]
在内循环中重新加载:
void f_sse(float *x, float *a, int n)
{
int i, j, l;
l = n / 4;
for (i = 0; i < l; i += 4) {
__m128 ai, xi;
ai = _mm_load_ps(a + i);
xi = _mm_load_ps(x + i);
for (j = 0; j < n; j++) {
/* re-load a[j] to get updates */
xi = _mm_add_ps(xi, _mm_add_ps(ai, _mm_set1_ps(((volatile float *)a)[j])));
}
_mm_store_ps(x + i, xi);
}
}
重新加载a[j]
并(a + i)
在内循环中:
void f_sse(float *x, float *a, int n)
{
int i, j, l;
l = n / 4;
for (i = 0; i < l; i += 4) {
__m128 ai, xi;
xi = _mm_load_ps(x + i);
for (j = 0; j < n; j++) {
/* re-load (a + i) to get updates */
ai = _mm_load_ps(a + i);
/* re-load a[j] to get updates */
xi = _mm_add_ps(xi, _mm_add_ps(ai, _mm_set1_ps(((volatile float *)a)[j])));
}
_mm_store_ps(x + i, xi);
}
}