我有两个数组,分别是 'fa' 和 'tempxyz'。我需要将其中一个减去另一个,并把结果存储到第三个数组中。我使用的是流式存储(streaming store)指令,因此需要对齐的内存访问。我已经对齐了这两个数组以及第三个数组,但仍然遇到段错误(segmentation fault)。对于流式存储,数组应该是 64 字节对齐的。这是否意味着数组的每个元素都应该相隔 64 个字节,以便每个元素的地址都是 64 的倍数?下面给出了我的代码片段。请帮帮我。
/* Forward declaration so the call in main() is checked against a prototype. */
void compute(double *f, double array[]);

/*
 * Allocates the 64-byte-aligned output buffer `force` (np * nd doubles),
 * declares the 64-byte-aligned input `array`, and runs compute() on them.
 *
 * np can be any number (np = 1000, 2000, etc.); nd = 3.
 *
 * Returns 0 on success, 1 if the aligned allocation fails.
 */
int main(void)
{
    __declspec(align(64)) double array[np * nd];

    double *force = (double *)_mm_malloc((np * nd) * sizeof *force, 64);
    if (!force) {
        return 1; /* allocation failed: nothing to compute into */
    }

    compute(force, array);

    /* Memory obtained from _mm_malloc must be released with _mm_free. */
    _mm_free(force);
    return 0;
}
void compute (double *f double array[np*nd])
{
__declspec(align(64)) double fa[8], tempxyz[8];
for(k=0;k<np;k++)
{
__assume_aligned(f,64);
__assume((k*nd) % 8 == 0);
for ( i = 0; i < nd; i++ )
{
f[i+k*nd] = 0.0;
}
// Doing some computation on array and storing it in fa.
fa[0] = array[k*nd+0];
fa[1] = array[k*nd+1];
fa[2] = array[k*nd+2];
__m512d y1, y2, y3;
__assume_aligned(&fa,64);
__assume_aligned(&tempxyz,64);
// Want to load 3 elements at a time, subtract all the three
// and store it at a memory location.
y1 = _mm512_load_pd(fa);
y2 = _mm512_load_pd(tempxyz);
y3 = _mm512_sub_pd(y1,y2);
__assume_aligned(f,64);
__assume((k*nd) % 8 == 0); // Here nd=3 and k is loop index variable.
_mm512_storenr_pd((f+k*nd), y3); // streaming store instruction
// --- GIVING SEG. FAULT !!!
} // end of k loop
}// end of compute function