我写了一些代码来对我的想法做一个简单的测试:
// Compile using gcc -O4 main.c && objdump -d a.out
// Element-wise sum of two groups of four floats passed as scalars.
//
// x0..x3  first operand group
// y0..y3  second operand group
// out0..out3  destinations; receive x0 + y0 .. x3 + y3, written in that order
//
// Author's observation from the disassembly: the stand-alone (non-inlined)
// copy of this function works in xmm registers but issues four separate
// additions, one per element, instead of a single packed add.
void add4(float x0, float x1, float x2, float x3,
          float y0, float y1, float y2, float y3,
          float *out0, float *out1, float *out2, float *out3)
{
    const float sum0 = x0 + y0;
    const float sum1 = x1 + y1;
    const float sum2 = x2 + y2;
    const float sum3 = x3 + y3;

    // Store in the same order as always, so aliased outputs see the same
    // final value.
    *out0 = sum0;
    *out1 = sum1;
    *out2 = sum2;
    *out3 = sum3;
}
// Element-wise difference of two groups of four floats passed as scalars.
//
// x0..x3  minuend group
// y0..y3  subtrahend group
// out0..out3  destinations; receive x0 - y0 .. x3 - y3, written in that order
void sub4(float x0, float x1, float x2, float x3,
          float y0, float y1, float y2, float y3,
          float *out0, float *out1, float *out2, float *out3)
{
    const float diff0 = x0 - y0;
    const float diff1 = x1 - y1;
    const float diff2 = x2 - y2;
    const float diff3 = x3 - y3;

    // Store in the same order as always, so aliased outputs see the same
    // final value.
    *out0 = diff0;
    *out1 = diff1;
    *out2 = diff2;
    *out3 = diff3;
}
// Computes (x + y) - z element-wise for three groups of four floats passed
// as scalars, by chaining add4 and sub4.
//
// x0..x3, y0..y3  groups that are added together
// z0..z3  group subtracted from that sum
// out0..out3  destinations for the four results
//
// Author's observation from the disassembly: in the stand-alone
// (non-inlined) copy of this function, add4 and sub4 are both inlined and
// the xmm registers are reused between the add and the subtract, but the
// arithmetic is still not done as 4-way SIMD.
void add4_then_sub4(float x0, float x1, float x2, float x3,
                    float y0, float y1, float y2, float y3,
                    float z0, float z1, float z2, float z3,
                    float *out0, float *out1, float *out2, float *out3)
{
    float sum0, sum1, sum2, sum3;

    // Stage 1: sum = x + y
    add4(x0, x1, x2, x3,
         y0, y1, y2, y3,
         &sum0, &sum1, &sum2, &sum3);

    // Stage 2: out = sum - z
    sub4(sum0, sum1, sum2, sum3,
         z0, z1, z2, z3,
         out0, out1, out2, out3);
}
// Array front end for add4_then_sub4: out[i] = (x[i] + y[i]) - z[i] for
// i = 0..3.
//
// x, y, z  inputs; elements 0..3 of each are read
// out      destination; elements 0..3 are written
//
// The author wrote this as a stand-in for main(): because the arrays arrive
// as arguments, the compiler cannot optimize them away in the non-inlined
// copy of this function.
// Author's observation from the disassembly: this version DOES compile into
// what appear to be unaligned moves plus a single vectorized add and a
// single vectorized subtract.
void add4_then_sub4_arrays(const float x[4],
                           const float y[4],
                           const float z[4],
                           float out[4])
{
    add4_then_sub4(x[0], x[1], x[2], x[3],              // first addend
                   y[0], y[1], y[2], y[3],              // second addend
                   z[0], z[1], z[2], z[3],              // subtrahend
                   out, out + 1, out + 2, out + 3);     // result slots
}
// Program entry point. It performs no work: per the compile comment at the
// top of the file, the program is built only so that the functions above can
// be inspected with objdump.
//
// Returns 0 explicitly. Before C99 (including the gnu90 dialect that older
// GCC releases use by default), reaching the end of main without a return
// statement leaves the exit status undefined.
int main(int argc, char **argv)
{
    // The command-line arguments are not used; the casts make that explicit
    // and avoid unused-parameter warnings.
    (void)argc;
    (void)argv;

    return 0;
}
来看一下为 add4_then_sub4_arrays 生成的汇编代码:
0000000000400600 <add4_then_sub4_arrays>:
400600: 0f 57 c0 xorps %xmm0,%xmm0
400603: 0f 57 c9 xorps %xmm1,%xmm1
400606: 0f 12 06 movlps (%rsi),%xmm0
400609: 0f 12 0f movlps (%rdi),%xmm1
40060c: 0f 16 46 08 movhps 0x8(%rsi),%xmm0
400610: 0f 16 4f 08 movhps 0x8(%rdi),%xmm1
400614: 0f 58 c1 addps %xmm1,%xmm0
400617: 0f 57 c9 xorps %xmm1,%xmm1
40061a: 0f 12 0a movlps (%rdx),%xmm1
40061d: 0f 16 4a 08 movhps 0x8(%rdx),%xmm1
400621: 0f 5c c1 subps %xmm1,%xmm0
400624: 0f 13 01 movlps %xmm0,(%rcx)
400627: 0f 17 41 08 movhps %xmm0,0x8(%rcx)
40062b: c3 retq
40062c: 0f 1f 40 00 nopl 0x0(%rax)
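编译器无法假定数组已对齐,因此每个 128 位的加载和存储都被拆成了 movlps/movhps 两条指令,数据移动操作比理想情况多得多。我不确定那些 xorps 的作用(可能是在 movlps 只写入寄存器低半部分之前先将寄存器清零,以打破对寄存器旧值的依赖),但确实只有一条 4 路加法(addps)和一条 4 路减法(subps)。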
所以答案是 gcc 至少有一些能力将标量浮点操作打包回 SIMD 操作。
更新:使用 `gcc-4.8 -O3 -march=native main.c && objdump -d a.out` 可以得到更紧凑的代码:
0000000000400600 <add4_then_sub4_arrays>:
400600: c5 f8 10 0e vmovups (%rsi),%xmm1
400604: c5 f8 10 07 vmovups (%rdi),%xmm0
400608: c5 f0 58 c0 vaddps %xmm0,%xmm1,%xmm0
40060c: c5 f8 10 0a vmovups (%rdx),%xmm1
400610: c5 f8 5c c1 vsubps %xmm1,%xmm0,%xmm0
400614: c5 f8 11 01 vmovups %xmm0,(%rcx)
400618: c3 retq
400619: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
使用 `clang-4.0 -O3 -march=native main.c && llvm-objdump -d a.out` 的结果:
add4_then_sub4_arrays:
4005e0: c5 f8 10 07 vmovups (%rdi), %xmm0
4005e4: c5 f8 58 06 vaddps (%rsi), %xmm0, %xmm0
4005e8: c5 f8 5c 02 vsubps (%rdx), %xmm0, %xmm0
4005ec: c5 f8 11 01 vmovups %xmm0, (%rcx)
4005f0: c3 ret
4005f1: 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax)