我试图了解严格混叠对 C99 性能的影响。我的目标是优化矢量点积,这在我的程序中占用了大量时间(分析它!)。我认为混叠可能是问题所在,但以下代码并未显示标准方法和严格混叠版本之间的任何实质性差异,即使向量大小为 1 亿。我还尝试使用局部变量来避免别名,结果相似。
发生了什么?
我在 OSX 10.7.4 上使用 gcc-4.7。结果以微秒为单位。
$ /usr/local/bin/gcc-4.7 -fstrict-aliasing -Wall -std=c99 -O3 -o restrict restrict.c
$ ./restrict
sum: 100000000 69542
sum2: 100000000 70432
sum3: 100000000 70372
sum4: 100000000 69891
$ /usr/local/bin/gcc-4.7 -Wall -std=c99 -O0 -fno-strict-aliasing -o restrict restrict.c
$ ./restrict
sum: 100000000 258487
sum2: 100000000 261349
sum3: 100000000 258829
sum4: 100000000 258129
restrict.c(注意此代码将需要数百 MB RAM):
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
/* original */
long sum(int *x, int *y, int n)
{
long i, s = 0;
for(i = 0 ; i < n ; i++)
s += x[i] * y[i];
return s;
}
/* restrict */
long sum2(int *restrict x, int *restrict y, int n)
{
long i, s = 0;
for(i = 0 ; i < n ; i++)
s += x[i] * y[i];
return s;
}
/* local restrict */
long sum3(int *x, int *y, int n)
{
int *restrict xr = x;
int *restrict yr = y;
long i, s = 0;
for(i = 0 ; i < n ; i++)
s += xr[i] * yr[i];
return s;
}
/* use local variables */
long sum4(int *x, int *y, int n)
{
int xr, yr;
long i, s = 0;
for(i = 0 ; i < n ; i++)
{
xr = x[i];
yr = y[i];
s += xr * yr;
}
return s;
}
int main(void)
{
struct timeval tp1, tp2;
struct timezone tzp;
long i, n = 1e8L, s;
int *x = malloc(sizeof(int) * n);
int *y = malloc(sizeof(int) * n);
long elapsed1;
for(i = 0 ; i < n ; i++)
x[i] = y[i] = 1;
gettimeofday(&tp1, &tzp);
s = sum(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum:\t%ld\t%ld\n", s, elapsed1);
gettimeofday(&tp1, &tzp);
s = sum2(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum2:\t%ld\t%ld\n", s, elapsed1);
gettimeofday(&tp1, &tzp);
s = sum3(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum3:\t%ld\t%ld\n", s, elapsed1);
gettimeofday(&tp1, &tzp);
s = sum3(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum4:\t%ld\t%ld\n", s, elapsed1);
return EXIT_SUCCESS;
}