I don't know whether this is a correct answer, so please correct me if I'm wrong, since I'm not sure the test itself is adequate. I decided to test vector addition as follows:
#include <thrust/device_vector.h>
#include <cuda_runtime.h>
#include <iostream>
// Element-wise vector addition: C[i] = A[i] + B[i]
__global__ void vectorAdd(const int *A, const int *B, int *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) C[i] = A[i] + B[i];
}
int main(void)
{
    int numElements = 1024;
    int randacc = 30;    // index used for the spot check below

    cudaSetDevice(0);    // run the test on device 0 first

    // allocate and fill the vectors while device 0 is current
    thrust::device_vector<int> a(numElements, 1);
    thrust::device_vector<int> b(numElements, 2);
    thrust::device_vector<int> c(numElements);

    // raw device pointers so the vectors can be passed to the kernel
    int* a_d = thrust::raw_pointer_cast(a.data());
    int* b_d = thrust::raw_pointer_cast(b.data());
    int* c_d = thrust::raw_pointer_cast(c.data());

    int threadsPerBlock = 64;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, c_d, numElements);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) std::cerr << cudaGetErrorString(err) << std::endl;

    std::cout << "random access on dev 0, c = " << c[randacc] << std::endl;

    // deallocate the vectors before switching devices
    a.clear(); thrust::device_vector<int>().swap(a);
    b.clear(); thrust::device_vector<int>().swap(b);
    c.clear(); thrust::device_vector<int>().swap(c);
    cudaSetDevice(1);    // switch to device 1 and repeat the test

    // resize() reallocates the vectors, so the new storage is obtained
    // while device 1 is current
    a.resize(numElements, 1);
    b.resize(numElements, 2);
    c.resize(numElements);

    a_d = thrust::raw_pointer_cast(a.data());
    b_d = thrust::raw_pointer_cast(b.data());
    c_d = thrust::raw_pointer_cast(c.data());

    threadsPerBlock = 64;
    blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(a_d, b_d, c_d, numElements);

    err = cudaGetLastError();
    if (err != cudaSuccess) std::cerr << cudaGetErrorString(err) << std::endl;

    std::cout << "random access on dev 1, c = " << c[randacc] << std::endl;

    return 0;
}
I got the results:

random access on dev 0, c = 3
random access on dev 1, c = 3

Note: you need at least 2 GPUs on the same host to test this. I tested it on my GTX 690.
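For completeness, here is a minimal sketch (my addition, not part of the test above) of how one might guard the test on machines with fewer than 2 GPUs, using cudaGetDeviceCount:

#include <cuda_runtime.h>
#include <iostream>

int main(void)
{
    // query how many CUDA-capable devices are visible to this process
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess) {
        std::cerr << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    if (deviceCount < 2) {
        std::cerr << "This test needs at least 2 GPUs, found "
                  << deviceCount << std::endl;
        return 1;
    }
    std::cout << "Found " << deviceCount
              << " GPUs, OK to run the test" << std::endl;
    return 0;
}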