以供参考:
template <typename T,
typename UserAllocator = default_user_allocator_new_delete,
typename Mutex = details::pool::default_mutex,
unsigned NextSize = 32,
unsigned MaxSize = 0>
class pool_allocator;
我想也许锁定是罪魁祸首。此外,可能会有更好的暗示。
让我们测试一下!Live On 编译器资源管理器
#include <boost/core/demangle.hpp>
#include <boost/pool/pool_alloc.hpp>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <vector>
using namespace std::chrono_literals;
auto static now = std::chrono::high_resolution_clock::now;
template <typename Alloc> void test(int run, Alloc alloc = {}) {
auto load = [=](bool RESERVE, unsigned ITERATIONS = 1'000, unsigned SIZE = 10'000) {
for (unsigned i = 0; i < ITERATIONS; ++i) {
std::vector<int, Alloc> vec(alloc);
if (RESERVE)
vec.reserve(SIZE);
for (unsigned j = 0; j < SIZE; ++j)
vec.push_back(i + j);
}
};
auto lap_time = [t0 = now()]() mutable {
return now() - std::exchange(t0, now());
};
load(false); auto without_reserve = lap_time() / 1.0ms;
load(true); auto with_reserve = lap_time() / 1.0ms;
std::cout << "run " << run //
<< " naive: " << std::setw(7) << without_reserve << "ms" //
<< " reserved: " << std::setw(7) << with_reserve << "ms" //
<< "(" << boost::core::demangle(typeid(Alloc).name()) << ")" //
<< std::endl;
}
void run_tests(int run) {
test<std::allocator<int>>(run);
using NullMx = boost::details::pool::null_mutex;
using Mx = boost::details::pool::default_mutex;
using Malloc = boost::default_user_allocator_malloc_free;
using NewDelete = boost::default_user_allocator_new_delete;
//
// no hints
//
test<boost::pool_allocator<int, Malloc, NullMx>>(run);
test<boost::pool_allocator<int, NewDelete, NullMx>>(run);
test<boost::pool_allocator<int, Malloc, Mx>>(run);
test<boost::pool_allocator<int, NewDelete, Mx>>(run);
//
// hinted
//
test<boost::pool_allocator<int, Malloc, NullMx, 1'000, 0>>(run);
test<boost::pool_allocator<int, NewDelete, NullMx, 1'000, 0>>(run);
test<boost::pool_allocator<int, Malloc, Mx, 1'000, 0>>(run);
test<boost::pool_allocator<int, NewDelete, Mx, 1'000, 0>>(run);
}
int main()
{
std::cout << std::fixed << std::setprecision(3);
for (int run : {1,2,3}) {
auto t0 = now();
run_tests(run);
std::cout << " -- Done (" << (now() - t0) / 1.ms << "ms)" << std::endl;
}
}
编译器资源管理器显示了一些真正的不一致峰值;我自己的机器没有:
run 1 naive: 8.025ms reserved: 5.412ms(std::allocator<int>)
run 1 naive: 92.212ms reserved: 31.166ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 32u, 0u>)
run 1 naive: 93.466ms reserved: 29.901ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 32u, 0u>)
run 1 naive: 92.488ms reserved: 29.883ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 32u, 0u>)
run 1 naive: 92.450ms reserved: 29.824ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 32u, 0u>)
run 1 naive: 82.879ms reserved: 27.478ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 1000u, 0u>)
run 1 naive: 82.775ms reserved: 28.187ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 1000u, 0u>)
run 1 naive: 83.189ms reserved: 27.404ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 1000u, 0u>)
run 1 naive: 83.159ms reserved: 27.468ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 1000u, 0u>)
-- Done (947.595ms)
run 2 naive: 8.007ms reserved: 5.543ms(std::allocator<int>)
run 2 naive: 92.225ms reserved: 29.882ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 32u, 0u>)
run 2 naive: 92.311ms reserved: 29.805ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 32u, 0u>)
run 2 naive: 92.601ms reserved: 29.873ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 32u, 0u>)
run 2 naive: 92.421ms reserved: 30.028ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 32u, 0u>)
run 2 naive: 83.028ms reserved: 27.493ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 1000u, 0u>)
run 2 naive: 82.822ms reserved: 27.427ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 1000u, 0u>)
run 2 naive: 83.230ms reserved: 27.493ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 1000u, 0u>)
run 2 naive: 83.104ms reserved: 27.466ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 1000u, 0u>)
-- Done (944.958ms)
run 3 naive: 8.068ms reserved: 5.422ms(std::allocator<int>)
run 3 naive: 92.282ms reserved: 29.880ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 32u, 0u>)
run 3 naive: 92.064ms reserved: 29.960ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 32u, 0u>)
run 3 naive: 92.339ms reserved: 29.928ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 32u, 0u>)
run 3 naive: 92.977ms reserved: 29.890ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 32u, 0u>)
run 3 naive: 82.906ms reserved: 27.388ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 1000u, 0u>)
run 3 naive: 82.784ms reserved: 27.585ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 1000u, 0u>)
run 3 naive: 83.157ms reserved: 28.233ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 1000u, 0u>)
run 3 naive: 83.098ms reserved: 27.466ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 1000u, 0u>)
-- Done (945.629ms)
不过,显然,总是慢。来介绍一下
探查器
仅将标准分配器与boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 1000u, 0u>
:
到目前为止,一切都很好!那是什么花费了这么多时间?
从各种 Boost Pool 标头中unordered_malloc
内联到大量行。最严重的违规者来自boost/pool/simple_segregated_storage.hpp
:(第二列是成本相对于父母的百分比):
这些行在try_malloc_n
template <typename SizeType>
void * simple_segregated_storage<SizeType>::try_malloc_n(
void * & start, size_type n, const size_type partition_size)
{
void * iter = nextof(start);
while (--n != 0)
{
void * next = nextof(iter);
if (next != static_cast<char *>(iter) + partition_size)
{
// next == 0 (end-of-list) or non-contiguous chunk found
start = iter;
return 0;
}
iter = next;
}
return iter;
}
其中自我描述为:
该函数尝试在空闲列表中找到 n 个大小为 partition_size 的连续块,从 start 开始。如果成功,则返回该连续序列中的最后一个块,以便 [start, {retval}] 知道该序列。连续的块。在任何一种情况下,它都会返回 0,并将 start 设置为最后考虑的块。如果 nextof(start) == 0,则您位于空闲列表的末尾。否则,start 指向连续序列中的最后一个块,nextof(start) 指向下一个连续序列中的第一个块(假设有序免费列表)。
看来,在这种情况下,在隔离堆上追逐空闲块的成本毕竟太高了。一个小小的餐巾纸计算表明,它try_malloc_n
占用了我们之前看到的更高级别unordered_malloc
调用的 99.75%。
SHOCK:替代实施?
在我的调查中,我发现了一些可以用来获得更多洞察力的定义,例如:
#define NDEBUG
//#define BOOST_POOL_INSTRUMENT 1
//#define BOOST_POOL_VALIDATE 1
//#define BOOST_POOL_VALGRIND 1
现在,我使用 VALIDATE/INSTRUMENT 观察预期的效果(非常冗长的输出和轻微的性能下降)。
通过阅读名称/代码,我预计BOOST_POOL_VALGRIND
会类似地降低性能(毕竟,它可能应该做额外的工作以避免在运行 Valgrind 时出现误报内存错误,对吧?)。令我惊讶的是,定义它使整个事情运行得很快:Live On Compiler Explorer
run 1 naive: 8.166ms reserved: 5.267ms(std::allocator<int>)
run 1 naive: 9.713ms reserved: 5.267ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 32u, 0u>)
run 1 naive: 8.853ms reserved: 5.226ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 32u, 0u>)
run 1 naive: 8.990ms reserved: 5.282ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 32u, 0u>)
run 1 naive: 8.899ms reserved: 5.246ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 32u, 0u>)
run 1 naive: 8.620ms reserved: 5.237ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 1000u, 0u>)
run 1 naive: 8.622ms reserved: 5.247ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 1000u, 0u>)
run 1 naive: 8.963ms reserved: 5.257ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 1000u, 0u>)
run 1 naive: 8.990ms reserved: 5.271ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 1000u, 0u>)
-- Done (127.276ms)
run 2 naive: 7.965ms reserved: 5.208ms(std::allocator<int>)
run 2 naive: 8.503ms reserved: 5.236ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 32u, 0u>)
run 2 naive: 8.809ms reserved: 5.254ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 32u, 0u>)
run 2 naive: 8.954ms reserved: 5.278ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 32u, 0u>)
run 2 naive: 8.878ms reserved: 5.279ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 32u, 0u>)
run 2 naive: 8.694ms reserved: 5.243ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 1000u, 0u>)
run 2 naive: 8.661ms reserved: 5.249ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 1000u, 0u>)
run 2 naive: 8.920ms reserved: 5.248ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 1000u, 0u>)
run 2 naive: 8.952ms reserved: 5.261ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 1000u, 0u>)
-- Done (125.680ms)
run 3 naive: 7.949ms reserved: 5.221ms(std::allocator<int>)
run 3 naive: 8.498ms reserved: 5.238ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 32u, 0u>)
run 3 naive: 8.813ms reserved: 5.230ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 32u, 0u>)
run 3 naive: 9.033ms reserved: 5.279ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 32u, 0u>)
run 3 naive: 8.909ms reserved: 5.252ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 32u, 0u>)
run 3 naive: 8.605ms reserved: 5.244ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, boost::details::pool::null_mutex, 1000u, 0u>)
run 3 naive: 8.623ms reserved: 5.246ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, boost::details::pool::null_mutex, 1000u, 0u>)
run 3 naive: 8.918ms reserved: 5.247ms(boost::pool_allocator<int, boost::default_user_allocator_malloc_free, std::mutex, 1000u, 0u>)
run 3 naive: 8.969ms reserved: 5.268ms(boost::pool_allocator<int, boost::default_user_allocator_new_delete, std::mutex, 1000u, 0u>)
可悲的是,查看细节确认它通过委托给标准库直接作弊(同时增加了一些free_list
/used_list
地址集的开销)。
概括
是的,标准pool
/simple_segregated_storage
实现在这种负载下表现不佳。这是否真的是一个我无法确定的错误,但它确实看起来像它,通过文档(你也提到过)。