Robert Crovella 已经指出您的问题是停止规则,它是根据迭代次数给出的。
对于稍微复杂一点的二分法,停止规则也可以与目标精度相关。下面我提供了一个 CUDA 版本的二分法,它改编自《Numerical Recipes in C++》一书中给出的方法,通过它您还可以设置目标精度。
也许,还可以通过改编 CUDA SDK 示例 eigenvalues 中所使用的二分法内核,来获得计算上更为精巧的二分法实现。
该方法的新版本似乎更准确。下面是一些结果:
No target accuracy
1.571289062500
4.453613281250
6.504882812500
10.546875000000
13.171386718750
Target accuracy
1.570796326795
4.712388980385
7.853981633975
10.995574287564
14.137166941154
Actual roots
1.570796326794897
4.712388980384690
7.853981633974483
10.995574287564276
14.137166941154069
借助上述书中提供的方法,还可以得到更好的初始根区间(bracketing)。
这是代码
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <math_constants.h>
#define BLOCKSIZE 512
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer division of a by b, bumped by one whenever the division leaves a
// remainder. Used to size a grid so that (blocks * threads per block) covers
// every element.
//   a : dividend (e.g. number of elements)
//   b : divisor  (e.g. threads per block); must be non-zero
int iDivUp(int a, int b)
{
    const int  quotient     = a / b;
    const bool hasRemainder = (a % b) != 0;
    return hasRemainder ? quotient + 1 : quotient;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call (or any expression yielding cudaError_t) and
// reports the failing source location. The do { } while (0) wrapper makes the
// macro expand to a single statement, so it is safe inside an unbraced
// if / else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when code is not cudaSuccess.
//   code  : status to check
//   file  : source file name to report (const because __FILE__ is a string literal)
//   line  : source line number to report
//   abort : when true (default), terminate the process with code as exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
/************************************/
/* FUNCTION TO SEARCH THE ROOTS FOR */
/************************************/
// Function whose roots are searched for; callable from both host and device
// code. Currently cos(x).
__host__ __device__ double f(double x)
{
    // Alternative test function, kept for reference:
    //return ((5*sin(2*x))-(52*cos(2*x)))+50;
    const double value = cos(x);
    return value;
}
/***************************************/
/* BISECTION KERNEL - ORIGINAL VERSION */
/***************************************/
// Fixed-iteration bisection: thread tid refines the bracket [a[tid], b[tid]]
// of f for loopcnt halvings.
//
//   a, b    : in/out - bracket end points; overwritten with the narrowed bracket
//   c       : out    - last midpoint computed (untouched when loopcnt <= 0)
//   N       : number of brackets; threads with tid >= N do nothing
//   loopcnt : number of bisection steps to perform
//
// Launch: 1D grid of 1D blocks with gridDim.x * blockDim.x >= N.
// The bracket is read once on entry and written back once on exit, so the
// iterations themselves run out of registers.
__global__ void bisection(double *a, double *b, double *c, int N, int loopcnt)
{
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid >= N || loopcnt <= 0) return;   // nothing to refine, nothing to write

    double lo  = a[tid];
    double hi  = b[tid];
    double flo = f(lo);                     // f at the lower end, reused across iterations
    double mid = lo;

    for (int iter = 0; iter < loopcnt; iter++) {
        mid = (lo + hi)/2;
        const double fmid = f(mid);

        // Exact root hit: collapse the bracket onto it and stop. Without this,
        // lo would take the root's place, every later sign product would be
        // zero, and the bracket would drift towards hi, away from the root.
        if (fmid == 0.0) { lo = mid; hi = mid; break; }

        if ((fmid * flo) < 0) hi = mid;     // sign change in [lo, mid]
        else { lo = mid; flo = fmid; }      // otherwise keep [mid, hi]
    }

    a[tid] = lo;
    b[tid] = hi;
    c[tid] = mid;
}
/************************************************/
/* BISECTION KERNEL - NUMERICAL RECIPES VERSION */
/************************************************/
// --- Using bisection, find the root of f known to lie between d_x1[tid] and d_x2[tid].
//     The root is refined until the remaining half-interval is smaller than xacc.
//
//   d_x1, d_x2 : in  - bracket end points, one bracket per thread (not modified)
//   d_roots    : out - root estimate, or NaN when no root could be delivered
//   xacc       : target absolute accuracy on the root
//   loopcnt    : maximum allowed number of bisections
//   N          : number of brackets; threads with tid >= N do nothing
//
// d_roots[tid] is set to NaN when f does not change sign over the bracket, or
// when xacc is not reached within loopcnt bisections.
// Launch: 1D grid of 1D blocks with gridDim.x * blockDim.x >= N.
__global__ void bisection_NR(const double *d_x1, const double *d_x2, double *d_roots, const double xacc, const int loopcnt, const int N) {
    int tid = blockIdx.x*blockDim.x+threadIdx.x;
    if (tid >= N) return;

    const double x1 = d_x1[tid];
    const double x2 = d_x2[tid];
    const double f1 = f(x1);
    const double f2 = f(x2);

    // --- An end point that is itself a root is a valid answer.
    if (f1 == 0.0) { d_roots[tid] = x1; return; }
    if (f2 == 0.0) { d_roots[tid] = x2; return; }

    // --- The root must be bracketed: f has to take strictly opposite signs at
    //     the two end points. Comparing signs (instead of testing the product)
    //     is immune to underflow and also rejects NaN function values.
    const bool bracketed = (f1 < 0.0 && f2 > 0.0) || (f1 > 0.0 && f2 < 0.0);
    if (!bracketed) { d_roots[tid] = CUDART_NAN; return; }

    // --- Orient the search so that f > 0 lies at rtb + dx.
    double rtb, dx;
    if (f1 < 0.0) { rtb = x1; dx = x2 - x1; }
    else          { rtb = x2; dx = x1 - x2; }

    // --- Bisection loop.
    for (int j = 0; j < loopcnt; j++) {
        dx *= 0.5;
        const double xmid = rtb + dx;
        const double fmid = f(xmid);
        if (fmid <= 0.0) rtb = xmid;        // keep f(rtb) <= 0
        if (fabs(dx) < xacc || fmid == 0.0) { d_roots[tid] = rtb; return; }
    }

    // --- Too many bisections: target accuracy not reached.
    d_roots[tid] = CUDART_NAN;
}
/********/
/* MAIN */
/********/
// Scans [x1, x2] in steps of Deltax for sub-intervals over which f does not
// keep a strictly constant sign, then refines each of them on the GPU twice:
// first with the fixed-iteration kernel, then with the target-accuracy kernel.
// The roots found by each kernel are printed one per line, the two lists
// separated by a blank line.
int main()
{
    const int    loops  = 100000;       // --- Number of bisection iterations to run (maximum for bisection_NR)
    const double x1     = 0.0;          // --- Minimum value of the search interval
    const double x2     = 10.0;         // --- Maximum value of the search interval
    const double Deltax = 1.0;          // --- Sampling step of the search interval
    const double xacc   = 2.5e-13;      // --- Target accuracy for bisection_NR

    if (!(Deltax > 0.0) || !(x2 > x1)) {
        fprintf(stderr, "Invalid search interval or sampling step\n");
        return EXIT_FAILURE;
    }

    // --- Number of search intervals. Computed in floating point so that a
    //     step smaller than 1 does not cause an integer division by zero.
    const int N = (int)((x2 - x1) / Deltax);
    if (N <= 0) {
        fprintf(stderr, "Sampling step larger than the search interval\n");
        return EXIT_FAILURE;
    }

    // --- Host-side memory allocations (N is an upper bound on the number of brackets)
    double *host_a = (double*)malloc(N*sizeof(double));
    double *host_b = (double*)malloc(N*sizeof(double));
    double *host_c = (double*)malloc(N*sizeof(double));
    if (host_a == NULL || host_b == NULL || host_c == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(host_a);
        free(host_b);
        free(host_c);
        return EXIT_FAILURE;
    }

    // --- Initialize vectors on host: keep every sub-interval of [x1, x2] over
    //     which f does not keep a strictly constant sign. The scan is bounded
    //     by the N sub-intervals, so it terminates even if f has no root.
    int numBrackets = 0;
    for (int k = 0; k < N; k++) {
        const double left  = x1 + k * Deltax;
        const double right = left + Deltax;
        if ((f(left)*f(right)) > 0) continue;
        host_a[numBrackets] = left;
        host_b[numBrackets] = right;
        numBrackets++;
    }

    if (numBrackets == 0) {
        printf("No sign change found in the search interval\n");
        free(host_a);
        free(host_b);
        free(host_c);
        return 0;
    }

    const size_t numBytes  = numBrackets*sizeof(double);
    const int    numBlocks = iDivUp(numBrackets, BLOCKSIZE);

    // --- Device-side memory allocations
    double *dev_a; gpuErrchk(cudaMalloc(&dev_a, numBytes));
    double *dev_b; gpuErrchk(cudaMalloc(&dev_b, numBytes));
    double *dev_c; gpuErrchk(cudaMalloc(&dev_c, numBytes));

    // --- Copy host vectors to device
    gpuErrchk(cudaMemcpy(dev_a, host_a, numBytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_b, host_b, numBytes, cudaMemcpyHostToDevice));

    // --- Fixed-iteration bisection. Argument order is (..., N, loopcnt).
    bisection<<<numBlocks, BLOCKSIZE>>>(dev_a, dev_b, dev_c, numBrackets, loops);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaMemcpy(host_c, dev_c, numBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < numBrackets; i++) printf("%3.12f\n",host_c[i]);
    printf("\n");

    // --- The kernel above narrows dev_a / dev_b in place; restore the original
    //     brackets so that the second kernel starts from the same data.
    gpuErrchk(cudaMemcpy(dev_a, host_a, numBytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(dev_b, host_b, numBytes, cudaMemcpyHostToDevice));

    // --- Target-accuracy bisection. Argument order is (..., xacc, loopcnt, N).
    bisection_NR<<<numBlocks, BLOCKSIZE>>>(dev_a, dev_b, dev_c, xacc, loops, numBrackets);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaMemcpy(host_c, dev_c, numBytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < numBrackets; i++) printf("%3.12f\n",host_c[i]);

    // --- Release device memory
    gpuErrchk(cudaFree(dev_a));
    gpuErrchk(cudaFree(dev_b));
    gpuErrchk(cudaFree(dev_c));

    // --- Release host memory
    free(host_a);
    free(host_b);
    free(host_c);

    return 0;
}