先说明一点:这段代码能按预期工作,但速度非常慢。有没有办法让牛顿法收敛得更快,或者不借助浮点数组,就把一个 __m256 变量的所有元素设为同一个浮点数?
// Approximates the root-th root of each of the eight floats packed in `a`
// using a fixed number of Newton iterations:
//
//     x_next = (1/root) * ((root - 1) * x + a / x^(root - 1))
//
// Parameters:
//   a    - eight packed single-precision inputs.
//   root - degree of the root; the iteration is only meaningful for
//          root >= 1 (root == 1 returns `a` unchanged).
// Returns:
//   the packed estimates after 20 iterations. The division is carried out
//   with _mm256_rcp_ps, an approximate reciprocal, so the result is an
//   estimate rather than a correctly rounded root.
//
// NOTE(review): the starting guess 1 / (a * root) is kept from the original;
// for large inputs it is far from the root and 20 iterations may not be
// enough to converge - confirm against the expected input range.
__m256 nthRoot(__m256 a, int root){
    // The first root of a value is the value itself. Without this guard the
    // power loop below leaves x^1 (not x^0) in the divisor and the update
    // degenerates to x = a / x, which never settles.
    if (root == 1) {
        return a;
    }

    // Broadcast the scalar constants straight into registers. This replaces
    // the stack arrays that were only 16-byte aligned yet read with
    // _mm256_load_ps, which requires 32-byte alignment.
    __m256 R  = _mm256_set1_ps((float)root);
    __m256 Ni = _mm256_set1_ps(1.0f / (float)root);
    __m256 Nm = _mm256_set1_ps((float)(root - 1));

    // Initial guess: approximately 1 / (a * root).
    __m256 x = _mm256_rcp_ps(_mm256_mul_ps(R, a));

    for (int i = 0; i < 20; i++) {
        // tmpx = x^(root - 1): start at x and multiply root - 2 more times.
        __m256 tmpx = x;
        for (int k = 0; k < root - 2; k++) {
            tmpx = _mm256_mul_ps(x, tmpx);
        }
        // a / x^(root - 1), via the approximate reciprocal.
        __m256 tar = _mm256_mul_ps(a, _mm256_rcp_ps(tmpx));
        // (root - 1) * x + a / x^(root - 1), fused multiply-add.
        tar = _mm256_fmadd_ps(Nm, x, tar);
        // Scale by 1 / root to obtain the next estimate.
        x = _mm256_mul_ps(Ni, tar);
    }
    return x;
}
编辑#1
// Approximates the root-th root of each of the eight floats packed in `a`
// using 10 Newton iterations:
//
//     x_next = (1/root) * ((root - 1) * x + a / x^(root - 1))
//
// Parameters:
//   a    - eight packed single-precision inputs.
//   root - degree of the root; the iteration is only meaningful for
//          root >= 1 (root == 1 returns `a` unchanged).
// Returns:
//   the packed estimates. The division uses _mm256_rcp_ps, an approximate
//   reciprocal, so the result is an estimate rather than a correctly
//   rounded root.
__m256 SSEnthRoot(__m256 a, int root){
    // The first root of a value is the value itself. Without this guard the
    // power loop below leaves x^1 (not x^0) in the divisor and the update
    // degenerates to x = a / x, which oscillates instead of converging.
    if (root == 1) {
        return a;
    }

    __m256 Ni = _mm256_set1_ps(1.0f / (float)root);
    __m256 Nm = _mm256_set1_ps((float)(root - 1));

    // Initial guess a / root. The exact 1/root already held in Ni is reused
    // instead of broadcasting root and taking its approximate reciprocal.
    __m256 x = _mm256_mul_ps(a, Ni);

    for (int i = 0; i < 10; i++) {
        // tmpx = x^(root - 1): start at x and multiply root - 2 more times.
        __m256 tmpx = x;
        for (int k = 0; k < root - 2; k++) {
            tmpx = _mm256_mul_ps(x, tmpx);
        }
        // a / x^(root - 1), via the approximate reciprocal.
        __m256 tar = _mm256_mul_ps(a, _mm256_rcp_ps(tmpx));
        // (root - 1) * x + a / x^(root - 1); separate multiply and add are
        // kept here so this variant does not need the FMA instruction.
        tar = _mm256_add_ps(_mm256_mul_ps(Nm, x), tar);
        // Scale by 1 / root to obtain the next estimate.
        x = _mm256_mul_ps(Ni, tar);
    }
    return x;
}
如果有任何能让牛顿法收敛得更快的提示或指点(pointers,不是内存里的那种指针),我将不胜感激。
编辑#2:用 _mm256_rcp_ps() 替换了一处 _mm256_set1_ps() 函数调用,因为我已经将所需内容的倒数加载到 R 中
// Approximates the root-th root of each of the eight floats packed in `a`
// using 20 Newton iterations:
//
//     x_next = (1/root) * ((root - 1) * x + a / x^(root - 1))
//
// Parameters:
//   a    - eight packed single-precision inputs.
//   root - degree of the root; the iteration is only meaningful for
//          root >= 1 (root == 1 returns `a` unchanged).
// Returns:
//   the packed estimates. The division uses _mm256_rcp_ps, an approximate
//   reciprocal, so the result is an estimate rather than a correctly
//   rounded root.
__m256 SSEnthRoot(__m256 a, int root){
    // The first root of a value is the value itself. Without this guard the
    // power loop below leaves x^1 (not x^0) in the divisor and the update
    // degenerates to x = a / x, which oscillates instead of converging.
    if (root == 1) {
        return a;
    }

    // 1 / root must be exact: it scales every iteration, so deriving it from
    // the approximate _mm256_rcp_ps would bias the value the iteration
    // settles on. The scalar division happens once, outside the loop.
    __m256 Ni = _mm256_set1_ps(1.0f / (float)root);
    __m256 Nm = _mm256_set1_ps((float)(root - 1));

    // Initial guess a / root.
    __m256 x = _mm256_mul_ps(a, Ni);

    for (int i = 0; i < 20; i++) {
        // tmpx = x^(root - 1): start at x and multiply root - 2 more times.
        __m256 tmpx = x;
        for (int k = 0; k < root - 2; k++) {
            tmpx = _mm256_mul_ps(x, tmpx);
        }
        // a / x^(root - 1), via the approximate reciprocal.
        __m256 tar = _mm256_mul_ps(a, _mm256_rcp_ps(tmpx));
        // (root - 1) * x + a / x^(root - 1), fused multiply-add.
        tar = _mm256_fmadd_ps(Nm, x, tar);
        // Scale by 1 / root to obtain the next estimate.
        x = _mm256_mul_ps(Ni, tar);
    }
    return x;
}
编辑#3
// Approximates the root-th root of each of the eight floats packed in `a`
// using 20 Newton iterations:
//
//     x_next = (1/root) * ((root - 1) * x + a / x^(root - 1))
//
// Parameters:
//   a    - eight packed single-precision inputs.
//   root - degree of the root; the iteration is only meaningful for
//          root >= 1 (root == 1 returns `a` unchanged).
// Returns:
//   the packed estimates. The division uses _mm256_rcp_ps, an approximate
//   reciprocal, so the result is an estimate rather than a correctly
//   rounded root.
__m256 SSEnthRoot(__m256 a, int root){
    // The first root of a value is the value itself. Without this guard the
    // power loop below leaves x^1 (not x^0) in the divisor and the update
    // degenerates to x = a / x, which oscillates instead of converging.
    if (root == 1) {
        return a;
    }

    // Exact 1 / root and (root - 1), broadcast to all eight lanes.
    __m256 Ni = _mm256_set1_ps(1.0f / (float)root);
    __m256 Nm = _mm256_set1_ps((float)(root - 1));

    // Initial guess a / root.
    __m256 x = _mm256_mul_ps(a, Ni);

    for (int i = 0; i < 20; i++) {
        // tmpx = x^(root - 1): start at x and multiply root - 2 more times.
        __m256 tmpx = x;
        for (int k = 0; k < root - 2; k++) {
            tmpx = _mm256_mul_ps(x, tmpx);
        }
        // a / x^(root - 1), via the approximate reciprocal.
        __m256 tar = _mm256_mul_ps(a, _mm256_rcp_ps(tmpx));
        // (root - 1) * x + a / x^(root - 1), fused multiply-add.
        tar = _mm256_fmadd_ps(Nm, x, tar);
        // Scale by 1 / root to obtain the next estimate.
        x = _mm256_mul_ps(Ni, tar);
    }
    return x;
}