全部:
我正在学习共享内存如何加速 GPU 编程过程。我正在使用下面的代码来计算每个元素的平方值加上其左右邻居的平均值的平方值。代码运行,但是结果并不如预期。
打印出来的前 10 个结果是 0,1,2,3,4,5,6,7,8,9,而我期望结果是 25,2,8, 18,32,50,72,98,128,162;
代码如下,参考这里;
你能告诉我哪一部分出了问题吗?非常感激你的帮助。
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
const int N=1024;
 __global__ void compute_it(float *data)
 {
 int tid = threadIdx.x;
 __shared__ float myblock[N];
 float tmp;
 // load the thread's data element into shared memory
 myblock[tid] = data[tid];
 // ensure that all threads have loaded their values into
 // shared memory; otherwise, one thread might be computing
 // on unitialized data.
 __syncthreads();
 // compute the average of this thread's left and right neighbors
 tmp = (myblock[tid>0?tid-1:(N-1)] + myblock[tid<(N-1)?tid+1:0]) * 0.5f;
 // square the previousr result and add my value, squared
 tmp = tmp*tmp + myblock[tid]*myblock[tid];
 // write the result back to global memory
 data[tid] = myblock[tid];
 __syncthreads();
  }
int main (){
char key;
float *a;
float *dev_a;
a = (float*)malloc(N*sizeof(float));
cudaMalloc((void**)&dev_a,N*sizeof(float));
for (int i=0; i<N; i++){
a [i] = i;
}
cudaMemcpy(dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
compute_it<<<N,1>>>(dev_a);
cudaMemcpy(a, dev_a, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0; i<10; i++){
std::cout<<a [i]<<",";
}
std::cin>>key;
free (a);
free (dev_a);