所以我的线程代码是:
DWORD WINAPI ThreadFunc1(LPVOID lpParam )
{
THREAD_DATA *ptrDat = (THREAD_DATA *)(lpParam);
int loc_N = ptrDat->loc_N ;
int ntimes = ptrDat->ntimes;
__m128d rx0, ry0, result0;
for( int ip= 0; ip < ntimes; ip++ ) {
result0 = _mm_setzero_pd();
if (loc_N%2 != 0){
rx0 = _mm_load_sd(ptrDat->X);
ry0 = _mm_load_sd(ptrDat->Y);
ry0 = _mm_mul_pd(rx0, ry0);
result0 = _mm_add_pd(result0, ry0);
}
for( int i = loc_N%2; i < loc_N; i+=2 ) {
rx0 = _mm_load_pd(ptrDat->X+i);
ry0 = _mm_load_pd(ptrDat->Y+i);
ry0 = _mm_mul_pd(rx0, ry0);
result0 = _mm_add_pd(result0, ry0);
}
_mm_storeh_pd ( &ptrDat->res, _mm_add_pd(result0, _mm_shuffle_pd( result0, result0,1 )));
}
ptrDat->ret = 0;
return 0;
}
有片段主要功能:
loc_N = N/np;
N = loc_N*np;
try
{
X = new double[N];
Y = new double[N];
}
catch(bad_alloc aa)
{
cout << "memory allocation error\n";
system("pause");
exit(1);
}
//preparation of X, Y
int i;
for(i=0; i<N; i++)
{
X[i] = (double)(i+1);
Y[i] = 1.0;
}
for(ip=0; ip<np; ip++)
{
tDat[ip].loc_N = loc_N;
tDat[ip].N = N;
tDat[ip].ntimes = ntimes;
tDat[ip].X = X + ip*loc_N;
tDat[ip].Y = Y + ip*loc_N;
tDat[ip].threadNo = ip;
hThread[ip] = CreateThread(
NULL,
0,
ThreadFunc1,
(void*)&tDat[ip],
0,
NULL
);
if( !hThread[ip] ) {
exit(1);
}
}
loc_N 是每个线程的元素数。向量 x 和 y 的 N 大小。ntimes - 是多次重复算法。threadNo - 线程号。
我不知道为什么当我有三个线程时程序在 ry0 = _mm_load_pd (ptrDat-> y + i); 上崩溃了