嗨,我的测试代码中没有出现预期的伪共享(false sharing),对此我感到困惑。
我正在尝试创建一个进程唯一的线程管理器,它可以均匀地管理多个线程。
这个唯一的线程管理器类不是线程池:它把任务函数分配给指定的线程执行,并且能够取得任务函数的返回值,而不是仅仅把任务推入队列后就不再过问。此外,线程管理器不关心任务的大小(计算量)。
线程管理器将被一个线程(主线程)用于处理计算部分,并且它会被非常频繁地使用。这样做的原因是,我的程序将采用游戏循环设计模式,并且我想让游戏循环超过 120 FPS,这意味着必须在 8.3 毫秒内完成 1 次游戏循环。一个线程(主线程)可能会在 1 次游戏循环内多次分配任务,因此减少/消除上下文切换成本是我主要关注的问题。我的结论是让线程管理器的线程以自旋方式(自旋锁)等待任务。
简而言之,游戏循环将按照两个步骤进行多次迭代。
- 主循环将任务分配给线程管理器。
- 等待线程管理器的任务结果。
下面是我的测试代码。
ThreadManager.h
namespace YSLibrary
{
class CThreadManager final
{
private:
static long long s_llLock;
static unsigned long long s_ullThreadCount;
static void** s_ppThreads;
static unsigned long* s_pThreadIDs;
static long long* s_pThreadQuits;
static long long* s_pTaskLocks;
static unsigned long long (**s_ppTasks)();
static unsigned long long* s_pTaskResults;
CThreadManager(){}
~CThreadManager(){}
__forceinline static void Lock()
{
while (true)
{
if (InterlockedCompareExchange64(&s_llLock, 1LL, 0LL) == 0LL)
{
return;
}
Sleep(0UL);
}
}
__forceinline static void Unlock()
{
InterlockedExchange64(&s_llLock, 0LL);
}
static unsigned long __stdcall Thread(void* const _pParameter)
{
const unsigned long long ullThreadIndex = reinterpret_cast<const unsigned long long>(_pParameter);
while (true)
{
if (InterlockedCompareExchange64(&s_pThreadQuits[ullThreadIndex], 0LL, 1LL) == 1LL)
{
return 1UL;
}
if (InterlockedCompareExchange64(&s_pTaskLocks[ullThreadIndex], 1LL, 0LL) == 0LL)
{
if (s_ppTasks[ullThreadIndex] != nullptr)
{
s_pTaskResults[ullThreadIndex] = s_ppTasks[ullThreadIndex]();
s_ppTasks[ullThreadIndex] = nullptr;
}
InterlockedExchange64(&s_pTaskLocks[ullThreadIndex], 0LL);
}
}
}
public:
enum class EResult : unsigned long long
{
None = 0ULL,
Success = 1ULL,
Fail_ArgumentNull = 2ULL,
Fail_ArgumentInvalid = 3ULL,
Fail_Locked = 4ULL,
Fail_ThreadCountNotZero = 5ULL,
Fail_ThreadCountZero = 6ULL,
Fail_ThreadsNotNull = 7ULL,
Fail_ThreadsNull = 8ULL,
Fail_ThreadIDsNotNull = 9ULL,
Fail_ThreadIDsNull = 10ULL,
Fail_ThreadQuitsNotNull = 11ULL,
Fail_ThreadQuitsNull = 12ULL,
Fail_TaskLocksNotNull = 13ULL,
Fail_TaskLocksNull = 14ULL,
Fail_TasksNotNull = 15ULL,
Fail_TasksNull = 16ULL,
Fail_TaskResultsNotNull = 17ULL,
Fail_TaskResultsNull = 18ULL,
Fail_CreateThread = 19ULL
};
__forceinline static EResult Initialize(const unsigned long long _ullThreadCount)
{
if (_ullThreadCount == 0ULL)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount != 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountNotZero;
}
if (s_ppThreads != nullptr)
{
Unlock();
return EResult::Fail_ThreadsNotNull;
}
if (s_pThreadIDs != nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNotNull;
}
if (s_pThreadQuits != nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNotNull;
}
if (s_pTaskLocks != nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNotNull;
}
if (s_ppTasks != nullptr)
{
Unlock();
return EResult::Fail_TasksNotNull;
}
if (s_pTaskResults != nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNotNull;
}
s_ullThreadCount = _ullThreadCount;
s_ppThreads = new void*[s_ullThreadCount]{};
s_pThreadIDs = new unsigned long[s_ullThreadCount]{};
s_pThreadQuits = new long long[s_ullThreadCount]{};
s_pTaskLocks = new long long[s_ullThreadCount]{};
s_ppTasks = new (unsigned long long (*[s_ullThreadCount])()){};
s_pTaskResults = new unsigned long long[s_ullThreadCount]{};
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
s_ppThreads[i] = CreateThread(nullptr, 0ULL, &Thread, reinterpret_cast<void*>(i), 0UL, &s_pThreadIDs[i]);
if (s_ppThreads[i] == nullptr)
{
// Rollback
for (unsigned long long j = 0ULL; j < i; ++j)
{
InterlockedExchange64(&s_pThreadQuits[i], 1LL);
}
unsigned long ulExitCode = 0UL;
for (unsigned long long j = 0ULL; j < i; ++j)
{
while (true)
{
GetExitCodeThread(s_ppThreads[j], &ulExitCode);
if (ulExitCode != static_cast<unsigned long>(STILL_ACTIVE))
{
CloseHandle(s_ppThreads[j]);
s_ppThreads[j] = nullptr;
break;
}
Sleep(0UL);
}
}
delete[] s_pTaskResults;
s_pTaskResults = nullptr;
delete[] s_ppTasks;
s_ppTasks = nullptr;
delete[] s_pTaskLocks;
s_pTaskLocks = nullptr;
delete[] s_pThreadQuits;
s_pThreadQuits = nullptr;
delete[] s_pThreadIDs;
s_pThreadIDs = nullptr;
delete[] s_ppThreads;
s_ppThreads = nullptr;
s_ullThreadCount = 0ULL;
Unlock();
return EResult::Fail_CreateThread;
}
}
Unlock();
return EResult::Success;
}
__forceinline static EResult Terminate()
{
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
InterlockedExchange64(&s_pThreadQuits[i], 1LL);
}
unsigned long ulExitCode = 0UL;
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
while (true)
{
GetExitCodeThread(s_ppThreads[i], &ulExitCode);
if (ulExitCode != static_cast<unsigned long>(STILL_ACTIVE))
{
CloseHandle(s_ppThreads[i]);
s_ppThreads[i] = nullptr;
break;
}
Sleep(0UL);
}
}
delete[] s_pTaskResults;
s_pTaskResults = nullptr;
delete[] s_ppTasks;
s_ppTasks = nullptr;
delete[] s_pTaskLocks;
s_pTaskLocks = nullptr;
delete[] s_pThreadQuits;
s_pThreadQuits = nullptr;
delete[] s_pThreadIDs;
s_pThreadIDs = nullptr;
delete[] s_ppThreads;
s_ppThreads = nullptr;
s_ullThreadCount = 0ULL;
Unlock();
return EResult::Success;
}
__forceinline static EResult Execute(const unsigned long long _ullThreadIndex, unsigned long long (*_pFunction)())
{
if (_pFunction == nullptr)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
if (_ullThreadIndex >= s_ullThreadCount)
{
Unlock();
return EResult::Fail_ArgumentInvalid;
}
while (true)
{
if (InterlockedCompareExchange64(&s_pTaskLocks[_ullThreadIndex], 1LL, 0LL) == 0LL)
{
s_ppTasks[_ullThreadIndex] = _pFunction;
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
Unlock();
return EResult::Success;
}
Sleep(0UL);
}
}
__forceinline static EResult WaitForResult(const unsigned long long _ullThreadIndex, unsigned long long* const _pFunctionResult)
{
if (_pFunctionResult == nullptr)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
if (_ullThreadIndex >= s_ullThreadCount)
{
Unlock();
return EResult::Fail_ArgumentInvalid;
}
while (true)
{
if (InterlockedCompareExchange64(&s_pTaskLocks[_ullThreadIndex], 1LL, 0LL) == 0LL)
{
if (s_ppTasks[_ullThreadIndex] == nullptr)
{
(*_pFunctionResult) = s_pTaskResults[_ullThreadIndex];
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
Unlock();
return EResult::Success;
}
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
}
Sleep(0UL);
}
}
};
}
主文件
#include <iostream>
#include <Windows.h>
#include "ThreadManager.h"
// Out-of-class definitions of CThreadManager's static data members.
// Everything starts zero/null, which is the state Initialize() requires
// (it fails with a *NotZero / *NotNull code otherwise).
long long YSLibrary::CThreadManager::s_llLock = 0LL;
unsigned long long YSLibrary::CThreadManager::s_ullThreadCount = 0ULL;
void** YSLibrary::CThreadManager::s_ppThreads = nullptr;
unsigned long* YSLibrary::CThreadManager::s_pThreadIDs = nullptr;
long long* YSLibrary::CThreadManager::s_pThreadQuits = nullptr;
long long* YSLibrary::CThreadManager::s_pTaskLocks = nullptr;
unsigned long long (**YSLibrary::CThreadManager::s_ppTasks)() = nullptr;
unsigned long long* YSLibrary::CThreadManager::s_pTaskResults = nullptr;
// Receives the task return values collected by main(); only the first seven
// entries are written there.
unsigned long long g_pResults[10]{};

// Scratch data written by one benchmark function each.
//
// The payload is 8 * 8 = 64 bytes, the cache-line size of the author's machine.
// Size alone does not keep an instance inside a single line: without an explicit
// alignment the object is only 8-byte aligned and may straddle two lines. The
// alignas pins every instance to its own 64-byte line.
struct alignas(64) SData
{
    unsigned long long ullData[8];
};
static_assert(sizeof(SData) == 64, "SData is expected to occupy exactly one 64-byte cache line");

SData g_stData{};    // written by Function()  (single-threaded baseline)
SData g_stData0{};   // written by Function0()
SData g_stData1{};   // written by Function1()
SData g_stData2{};   // written by Function2()
SData g_stData3{};   // written by Function3()
SData g_stData4{};   // written by Function4()
SData g_stData5{};   // written by Function5()
SData g_stData6{};   // written by Function6()
// Single-threaded baseline workload: stores 70,000,000 rand() values, one after
// another, into g_stData.ullData[0]. Always returns 1.
unsigned long long Function()
{
    constexpr unsigned long long kIterations = 70000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
// Worker workload 0: stores 10,000,000 rand() values, one after another, into
// g_stData0.ullData[0]. Always returns 1.
unsigned long long Function0()
{
    constexpr unsigned long long kIterations = 10000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData0.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
// Worker workload 1: stores 10,000,000 rand() values, one after another, into
// g_stData1.ullData[0]. Always returns 1.
unsigned long long Function1()
{
    constexpr unsigned long long kIterations = 10000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData1.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
// Worker workload 2: stores 10,000,000 rand() values, one after another, into
// g_stData2.ullData[0]. Always returns 1.
unsigned long long Function2()
{
    constexpr unsigned long long kIterations = 10000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData2.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
// Worker workload 3: stores 10,000,000 rand() values, one after another, into
// g_stData3.ullData[0]. Always returns 1.
unsigned long long Function3()
{
    constexpr unsigned long long kIterations = 10000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData3.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
// Worker workload 4: stores 10,000,000 rand() values, one after another, into
// g_stData4.ullData[0]. Always returns 1.
unsigned long long Function4()
{
    constexpr unsigned long long kIterations = 10000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData4.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
// Worker workload 5: stores 10,000,000 rand() values, one after another, into
// g_stData5.ullData[0]. Always returns 1.
unsigned long long Function5()
{
    constexpr unsigned long long kIterations = 10000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData5.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
// Worker workload 6: stores 10,000,000 rand() values, one after another, into
// g_stData6.ullData[0]. Always returns 1.
unsigned long long Function6()
{
    constexpr unsigned long long kIterations = 10000000ULL;
    unsigned long long ullRemaining = kIterations;
    while (ullRemaining != 0ULL)
    {
        g_stData6.ullData[0] = static_cast<unsigned long long>(rand());
        --ullRemaining;
    }
    return 1ULL;
}
int main()
{
unsigned long long ullStartTick = 0ULL;
unsigned long long ullEndTick = 0ULL;
srand((unsigned int)time(nullptr));
ullStartTick = GetTickCount64();
Function();
ullEndTick = GetTickCount64();
std::wcout << L"[Main]" << std::endl;
std::wcout << ullEndTick - ullStartTick << std::endl;
YSLibrary::CThreadManager::EResult eResult = YSLibrary::CThreadManager::EResult::None;
eResult = YSLibrary::CThreadManager::Initialize(7ULL);
ullStartTick = GetTickCount64();
eResult = YSLibrary::CThreadManager::Execute(0ULL, &Function0);
eResult = YSLibrary::CThreadManager::Execute(1ULL, &Function1);
eResult = YSLibrary::CThreadManager::Execute(2ULL, &Function2);
eResult = YSLibrary::CThreadManager::Execute(3ULL, &Function3);
eResult = YSLibrary::CThreadManager::Execute(4ULL, &Function4);
eResult = YSLibrary::CThreadManager::Execute(5ULL, &Function5);
eResult = YSLibrary::CThreadManager::Execute(6ULL, &Function6);
eResult = YSLibrary::CThreadManager::WaitForResult(0ULL, &g_pResults[0]);
eResult = YSLibrary::CThreadManager::WaitForResult(1ULL, &g_pResults[1]);
eResult = YSLibrary::CThreadManager::WaitForResult(2ULL, &g_pResults[2]);
eResult = YSLibrary::CThreadManager::WaitForResult(3ULL, &g_pResults[3]);
eResult = YSLibrary::CThreadManager::WaitForResult(4ULL, &g_pResults[4]);
eResult = YSLibrary::CThreadManager::WaitForResult(5ULL, &g_pResults[5]);
eResult = YSLibrary::CThreadManager::WaitForResult(6ULL, &g_pResults[6]);
ullEndTick = GetTickCount64();
std::wcout << L"[Thread Manager]" << std::endl;
std::wcout << ullEndTick - ullStartTick << std::endl;
YSLibrary::CThreadManager::Terminate();
system("pause");
return 0;
}
对于 Interlocked 系列函数、__forceinline、静态变量的脏声明等,我感到非常抱歉。
另一方面,我使用 `long long` 作为锁变量的原因是 Interlocked 系列函数没有对应 `bool` 类型的版本。我也尝试过 `short`,但当我测量 `short` 和 `long long` 的耗时时,两者没有显著差异。相反,`short` 稍微慢了一点,我猜原因是在 64 位环境中使用了 16 位寄存器。此外,`bool` 或 `short` 类型可能会导致内存对齐问题。所以我使用了 `long long` 类型。
CThreadManager 之所以有私有构造函数,是为了明确禁止“new CThreadManager()”。
我尽量减少了 `reinterpret_cast` 的使用。我原以为它只有编译期成本,但我在 Stack Overflow 上看到一个问题,说它有运行时成本。我还不确定,所以只在线程函数的开头使用一次。
到目前为止,我已经通过下面的修改验证了伪共享(false sharing)现象:
SData::ullData[8] -> SData::ullData[1]
此外,使用 Sleep(0) 显著减少了 WaitForResult() 中线程时间片的浪费,并减少了线程内的总执行时间。
这段代码的结果显示
[Main]
1828
[Thread Manager]
344
在我的环境中。
但是,我刚刚意识到,除了 SData::ullData 之外,还有另一处肯定会发生伪共享(false sharing)的地方,即 s_pThreadQuits、s_pTaskLocks、s_ppTasks、s_pTaskResults。
为什么这些变量没有发生伪共享?
[编辑]
我所说的“伪共享(false sharing)”是指“被不同线程访问、但位于同一缓存行的内存地址”,具体包括:
- SData g_stDataN(在每个 FunctionN() 中)
- s_pThreadQuits、s_pTaskLocks、s_pTaskResults、s_ppTasks(在 Thread() 中)
我认为上面第二项中的变量(s_pThreadQuits 等)也会像 g_stDataN 一样被加载到缓存行中(在我的环境中缓存行为 64 字节)。我把 SData 的大小设为 64 字节,是为了用“填充(padding)”的方法来避免伪共享。
但是,s_pThreadQuits 的元素大小既不是 64 字节,也没有做填充,所以它同样应该发生伪共享。
就像下面这张图片。
图片来源:https://www.codeproject.com/Articles/85356/Avoiding-and-Identifying-False-Sharing-Among-Threa