我在 C++ 中有以下代码(稍后解释):
#include <stdio.h>
#include <string>
#include <vector>
using namespace std;
// Bundle of the values that one thread keeps for one resistor.
// Currently holds a single double; a default-constructed instance has it
// set to zero.
struct th_private
{
    double mean_tau;

    // Zero the field through the member initialiser list.
    th_private() : mean_tau( 0.0 ) {}
};
class resistor
{
public:
string name;
/*****************************************************************************
Approach 0: Within each resistor strcuture, declare arrays of 'thread private'
variables. Thread 0 will use mean_tau[0], offset[0].., Thread 1 will use
mean_tau[1], offset[1]... and so on. As I understand, this is not a good
approach, would lead to a lot of false sharing.
/*****************************************************************************/
vector<double> mean_tau;
/*****************************************************************************
Approach 1: 1D array of struct th_private in each instance of the resistor,
where state[0] is used ONLY by thread[0], state[0] is used ONLY by thread[1]
and so on. Could potentially elimiate false sharing, but how to ensure
it will align in the cache?
/*****************************************************************************/
vector<th_private> state;
resistor( )
{
name = "";
}
void prepare_for_threads( int num_threads )
{
/* If Approach 0 */
mean_tau.resize(num_threads);
/* Else If Approach 1 */
state.resize(num_threads);
}
~resistor(){}
};
class mesh
{
public:
vector<resistor*> R;
mesh( int num_resistors, int num_threads )
{
for( int i = 0; i < num_resistors; i++ )
{
resistor *r = new resistor();
r->prepare_for_threads( num_threads );
R.push_back(r);
}
}
~mesh(){}
};
/*
 * Approach 2: one global 2D matrix in which each row belongs to a thread and
 * each column belongs to a resistor. The author's impression is that this
 * seems to be the best approach.
 *
 *             R[0]    R[1]    R[2]    R[3]    R[4]   ..   R[9]
 *   thread0: [0][0]  [0][1]  [0][2]  [0][3]  [0][4]  ..  [0][9]
 *   ...
 *   thread3: [3][0]  [3][1]  [3][2]  [3][3]  [3][4]  ..  [3][9]
 *
 * The alignment request below applies to the array object as a whole (its
 * first element); it does not by itself align each row separately.
 */
alignas(0x1000) th_private global_state[4][10];
// Builds a mesh of 10 resistors prepared for 4 threads, then prints
//   - sizeof(th_private),
//   - the addresses of the first resistor's four per-thread `state` slots
//     (Approach 1), and
//   - the addresses of the first four elements of row 0 of `global_state`
//     (Approach 2).
// Command-line arguments are ignored.
int main( int argc, char** argv )
{
    (void)argc;
    (void)argv;

    // Assume that there are 4 threads declared.
    mesh grid(10, 4);

    // sizeof yields size_t, which must be printed with %zu; %d expects int.
    printf("sizeof(th_private): %zu\n", sizeof(th_private));

    // %p requires void* arguments, so every address is converted explicitly.
    printf("Approach 1: %p %p %p %p\n",
           static_cast<void*>(&grid.R[0]->state[0]),
           static_cast<void*>(&grid.R[0]->state[1]),
           static_cast<void*>(&grid.R[0]->state[2]),
           static_cast<void*>(&grid.R[0]->state[3]));
    printf("Approach 2: %p %p %p %p\n",
           static_cast<void*>(&global_state[0][0]),
           static_cast<void*>(&global_state[0][1]),
           static_cast<void*>(&global_state[0][2]),
           static_cast<void*>(&global_state[0][3]));
}
在 64 位 linux 机器上的输出是:
sizeof(th_private): 8
Approach 1: 0x658080 0x658088 0x658090 0x658098
Approach 2: 0x608000 0x608008 0x608010 0x608018
每个电阻器都有一组属性,这些属性将由线程修改(读取和写入)。理想情况下,它们可以被视为线程私有变量。但是,由于旧代码库施加的一些限制,我只能采用以下三种方法之一:
- 方法 0:在每个电阻器结构中,声明“线程私有”变量的数组。线程 0 将使用 mean_tau[0]、offset[0]...,线程 1 将使用 mean_tau[1]、offset[1]...,依此类推。据我了解,这不是一个好方法,会导致大量伪共享(false sharing)。
- 方法 1:在电阻器的每个实例中放一个 struct th_private 的一维数组,其中 state[0] 仅由线程 0 使用,state[1] 仅由线程 1 使用,依此类推。这样也许可以消除伪共享,但如何确保各元素按缓存行对齐?
- 方法 2:声明一个全局 2D 矩阵,其中每一行属于一个线程,每一列属于一个电阻器(代码中的更多详细信息)。
现在,就 i) 避免伪共享和 ii) 缓存行对齐这两点而言,哪种方法最好?