我正在为性能测试编写一个简单的 CUDA 程序。
这与向量计算无关,仅用于简单的(并行)字符串转换。
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
/* Byte type used for all document buffers. */
#define UCHAR unsigned char
/* Not referenced in this listing. NOTE(review): `unsigned long int` is 64 bits
   wide on LP64 platforms, so the name may be misleading - confirm before use. */
#define UINT32 unsigned long int
/* Not referenced in this listing; `aes_context` is not declared here, so this
   macro must not be expanded unless that type is provided. */
#define CTX_SIZE sizeof(aes_context)
/* Size in bytes of the slot reserved for one document. */
#define DOCU_SIZE 4096
/* Number of documents held in each buffer. */
#define TOTAL 100000
/* Threads per block used for the kernel launch in main().
   Was misspelled BBLOCK_SIZE, which left BLOCK_SIZE undefined at its use site. */
#define BLOCK_SIZE 500

/* Host-side buffers: source text and converted result. */
UCHAR pH_TXT[DOCU_SIZE * TOTAL];
UCHAR pH_ENC[DOCU_SIZE * TOTAL];
/* Device-side counterparts, allocated with cudaMalloc in main(). */
UCHAR* pD_TXT;
UCHAR* pD_ENC;
/*
 * TEST_Encode - upper-cases the leading bytes of each document.
 *
 * Each thread handles one document: it copies the first 30 bytes of its
 * DOCU_SIZE-byte slot from a_input to a_output, mapping ASCII 'a'..'z' to
 * 'A'..'Z' and leaving every other byte unchanged. Bytes past the first 30
 * of the output slot are not written.
 *
 * Launch layout: 1D grid of 1D blocks with at least TOTAL threads in total;
 * surplus threads in the last block exit immediately.
 *
 * a_input  - device buffer holding TOTAL slots of DOCU_SIZE bytes (read).
 * a_output - device buffer holding TOTAL slots of DOCU_SIZE bytes (written).
 *
 * NOTE: neighbouring threads touch addresses DOCU_SIZE bytes apart, so the
 * accesses of one warp are not contiguous in memory.
 */
__global__
void TEST_Encode( UCHAR *a_input, UCHAR *a_output )
{
    // Global document index. Using threadIdx.x alone would make every block
    // reprocess the first blockDim.x documents and skip all the others.
    const size_t doc = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so the tail may overshoot TOTAL.
    if ( doc >= TOTAL ) {
        return;
    }

    const UCHAR *input  = a_input  + doc * DOCU_SIZE;
    UCHAR       *output = a_output + doc * DOCU_SIZE;

    for ( int i = 0 ; i < 30 ; i++ ) {
        const UCHAR c = input[i];
        if ( (c >= 'a') && (c <= 'z') ) {
            output[i] = (UCHAR)(c - 'a' + 'A');
        }
        else {
            output[i] = c;
        }
    }
}
/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * err  - status returned by the call (or by cudaGetLastError()).
 * what - short label identifying the call in the message.
 *
 * Returns 0 when err is cudaSuccess, 1 otherwise.
 */
static int reportCudaError( cudaError_t err, const char *what )
{
    if ( err == cudaSuccess ) {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Fills the host source buffer with TOTAL copies of a sample sentence, copies
 * it to the device, runs TEST_Encode over it and copies the result back.
 *
 * Returns 0 on success, 1 if the launch configuration is invalid for device 0
 * or any CUDA call fails.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const size_t totalBytes = (size_t)DOCU_SIZE * TOTAL;
    int exitCode = 1;

    struct cudaDeviceProp xCUDEV;
    if ( reportCudaError(cudaGetDeviceProperties(&xCUDEV, 0), "cudaGetDeviceProperties") ) {
        return 1;
    }

    // Validate the launch configuration up front; an invalid one would
    // otherwise make the launch fail without any visible diagnostic.
    const int threadsPerBlock = BLOCK_SIZE;
    if ( threadsPerBlock < 1 || threadsPerBlock > xCUDEV.maxThreadsPerBlock ) {
        fprintf(stderr, "block size %d is outside the device limit 1..%d\n",
                threadsPerBlock, xCUDEV.maxThreadsPerBlock);
        return 1;
    }
    const int blocksPerGrid = (TOTAL + threadsPerBlock - 1) / threadsPerBlock;
    if ( blocksPerGrid > xCUDEV.maxGridSize[0] ) {
        fprintf(stderr, "grid size %d exceeds the device limit %d\n",
                blocksPerGrid, xCUDEV.maxGridSize[0]);
        return 1;
    }

    // Prepare Source
    memset(pH_TXT, 0x00, totalBytes);
    for ( int i = 0 ; i < TOTAL ; i++ ) {
        strcpy((char*)pH_TXT + ((size_t)i * DOCU_SIZE), "hello world, i need an apple.");
    }

    // Single-pass loop so that any failure can `break` to the shared cleanup.
    do {
        // Allocate vectors in device memory
        if ( reportCudaError(cudaMalloc((void**)&pD_TXT, totalBytes), "cudaMalloc(pD_TXT)") ) break;
        if ( reportCudaError(cudaMalloc((void**)&pD_ENC, totalBytes), "cudaMalloc(pD_ENC)") ) break;

        // Copy vectors from host memory to device memory
        if ( reportCudaError(cudaMemcpy(pD_TXT, pH_TXT, totalBytes, cudaMemcpyHostToDevice),
                             "cudaMemcpy(host to device)") ) break;

        // Invoke kernel
        printf("Total Task is %d\n", TOTAL);
        printf("block size is %d\n", threadsPerBlock);
        printf("repeat cnt is %d\n", blocksPerGrid);
        TEST_Encode<<<blocksPerGrid, threadsPerBlock>>>(pD_TXT, pD_ENC);

        // A launch returns no status itself: fetch launch errors explicitly,
        // then wait for the kernel so that execution errors surface here.
        if ( reportCudaError(cudaGetLastError(), "TEST_Encode launch") ) break;
        if ( reportCudaError(cudaDeviceSynchronize(), "TEST_Encode execution") ) break;

        if ( reportCudaError(cudaMemcpy(pH_ENC, pD_ENC, totalBytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(device to host)") ) break;

        exitCode = 0;
    } while ( 0 );

    // Free device memory
    if (pD_TXT) cudaFree(pD_TXT);
    if (pD_ENC) cudaFree(pD_ENC);
    cudaDeviceReset();
    return exitCode;
}
当我让 BLOCK_SIZE 的值在 2 到 1000 之间变化时,得到了以下持续时间(来自 NVIDIA Visual Profiler):
TOTAL BLOCKS BLOCK_SIZE Duration(ms)
100000 50000 2 28.22
100000 10000 10 22.223
100000 2000 50 12.3
100000 1000 100 9.624
100000 500 200 10.755
100000 250 400 29.824
100000 200 500 39.67
100000 100 1000 81.268
我的 GPU 是 GeForce GT520,每个块的最大线程数(maxThreadsPerBlock)为 1024,所以我预计 BLOCK_SIZE 为 1000 时性能最好,但上表显示的结果并非如此。
我不明白为什么持续时间不是线性变化的,我该如何解决这个问题?(或者说,我怎样才能找到最优的块大小,即持续时间最短的取值?)