我在理解比较结果时遇到了一些问题。
配备 i7/intel hd4000 的笔记本电脑和配备 8Xeon 5400/7970 HDRadeon 的服务器。
我乘以:
int M =1024*2, N = 1024*6, P = 1024*2;
// N P
//|-----------| |-----------|
//| | | |
//|M | * |N |
//| | | |
//|-----------| |-----------|
这是内核:
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* Matrix multiplication: C = A * B.
* Device code.
*/
#ifndef BLOCK_SIZE
#define BLOCK_SIZE 16
#endif
#define AS(i, j) As[j + i * BLOCK_SIZE]
#define BS(i, j) Bs[j + i * BLOCK_SIZE]
///////////////////////////////////////////////////////////////////////////////
//! Matrix multiplication on the device: C = A * B
//! uiWA is A's width and uiWB is B's width
////////////////////////////////////////////////////////////////////////////////
__kernel void
m_m_mul( __global float* A, __global float* B, __global float* C,
/*__local float* As, __local float* Bs,*/ int uiWA, int uiWB, int trueLocalSize1)
{
__local float As[BLOCK_SIZE*BLOCK_SIZE];
__local float Bs[BLOCK_SIZE*BLOCK_SIZE];
// Block index
int bx = get_group_id(0);
int by = get_group_id(1);
// Thread index
int tx = get_local_id(0);
int ty = get_local_id(1);
// Index of the first sub-matrix of A processed by the block
int aBegin = uiWA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + uiWA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * uiWB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0.0f;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep) {
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + uiWA * ty + tx];
BS(ty, tx) = B[b + uiWB * ty + tx];
// Synchronize to make sure the matrices are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_global_id(1) < trueLocalSize1)
// Write the block sub-matrix to device memory;
// each thread writes one element
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
我比较Eigen::Matrix<float,-1,-1,Eigen::RowMajor> m4 = m1 * m2
;
在服务器上:
Creating matrices on GPU....... Done [0ms]
Creating matrices on CPU....... Done [0ms]
Filling GPU with random numbers....... Done [19ms]
M3 = M1 * M2... on GPU (Loading Kernels)... Done [240ms]
M3 = M1 * M2... on GPU (3 times)... Done [211ms]
Loading M1, M2 on GPU... Done [93ms]
M4 = M1 * M2 on CPU... Done [7775ms] Error:3.78049e-008
Press any key to continue . . .
Matlab: Elapsed time is 3.010626 seconds.
在笔记本电脑上:
Creating matrices on GPU....... Done [22ms]
Creating matrices on CPU....... Done [0ms]
Filling GPU with random numbers....... Done [35ms]
M3 = M1 * M2... on GPU (Loading Kernels)... Done [2975ms]
M3 = M1 * M2... on GPU (3 times)... Done [6891ms]
Loading M1, M2 on GPU... Done [80ms]
M4 = M1 * M2 on CPU... Done [5966ms] Error:3.78049e-008
Press any key to continue . . .
Matlab: Elapsed time is 2.310626 seconds.
我的问题是现在。1)为什么笔记本电脑的本征比至强的8核更快。难道是eigen在两个系统上都只使用一个核心,而i7的时钟速度更高?2.0 vs 2.4?
2) 在 labtop 上使用 Intel HD4000 与 Eigen 相比,速度几乎提高了 3 倍,但 Matlab 需要 2.3 秒才能进行相同的乘法运算。这与 HD4000 上的内核相同。(我可以做些什么让 Eigen 以与 Matlab 相同的速度运行吗?)