我正在尝试将 Nx3 数组传递给内核并像在纹理内存中一样从它读取并写入第二个数组。这是我的 N=8 的简化代码:
#include <cstdio>
#include "handle.h"
using namespace std;
texture<float,2> tex_w;
__global__ void kernel(int imax, float(*w)[3], float (*f)[3])
{
  int i = threadIdx.x;
  int j = threadIdx.y;
  if(i<imax)
      f[i][j] = tex2D(tex_w, i, j);
}
void print_to_stdio(int imax, float (*w)[3])
{
  for (int i=0; i<imax; i++)
    {
      printf("%2d  %3.6f\t  %3.6f\t %3.6f\n",i, w[i][0], w[i][1], w[i][2]);
    }
}
int main(void)
{
  int imax = 8;
  float (*w)[3];
  float (*d_w)[3], (*d_f)[3];
  dim3 grid(imax,3);
  w = (float (*)[3])malloc(imax*3*sizeof(float));
  for(int i=0; i<imax; i++)
    {
      for(int j=0; j<3; j++)
        {
          w[i][j] = i + 0.01f*j;
        }
    }
  cudaMalloc( (void**) &d_w, 3*imax*sizeof(float) );
  cudaMalloc( (void**) &d_f, 3*imax*sizeof(float) );
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  HANDLE_ERROR( cudaBindTexture2D(NULL, tex_w, d_w, desc, imax, 3, sizeof(float)*imax ) );
  cudaMemcpy(d_w, w, 3*imax*sizeof(float), cudaMemcpyHostToDevice);
  // just use threads for simplicity                                                                  
  kernel<<<1,grid>>>(imax, d_w, d_f);
  cudaMemcpy(w, d_f, 3*imax*sizeof(float), cudaMemcpyDeviceToHost);
  cudaUnbindTexture(tex_w);
  cudaFree(d_w);
  cudaFree(d_f);
  print_to_stdio(imax, w);
  free(w);
  return 0;
}
运行此代码我希望得到:
0  0.000000   0.010000   0.020000
1  1.000000   1.010000   1.020000
2  2.000000   2.010000   2.020000
3  3.000000   3.010000   3.020000
4  4.000000   4.010000   4.020000
5  5.000000   5.010000   5.020000
6  6.000000   6.010000   6.020000
7  7.000000   7.010000   7.020000
但相反我得到:
0  0.000000   2.020000   5.010000
1  0.010000   3.000000   5.020000
2  0.020000   3.010000   6.000000
3  1.000000   3.020000   6.010000
4  1.010000   4.000000   6.020000
5  1.020000   4.010000   7.000000
6  2.000000   4.020000   7.010000
7  2.010000   5.000000   7.020000
我认为这与我给 cudaBindTexture2D 的 pitch 参数有关,但使用较小的值会产生无效的参数错误。
提前致谢!