为了教育目的,我修改了我不久前编写的光线跟踪器,以利用 OpenMP 的多处理。但是,我没有看到并行化有任何好处。
我尝试了 3 种不同的方法:任务池方式(draw_pooled() 函数)、具有图像行级并行性的标准 OMP 并行嵌套 for 循环(draw_parallel_for()),以及另一个具有像素级并行性的 OMP for 循环(draw_parallel_for2())。原始的串行绘图例程也包括在内以供参考(draw_serial())。
我在一台 Linux 机器上运行 2560x1920 的渲染,配置为 Intel Core 2 Duo E6750(2 个内核 @ 2.67 GHz,每个带超线程)和 4 GB RAM,二进制文件由 gcc 编译并链接 libgomp。该场景的平均耗时如下:
- 串行渲染需要 120 秒,
- 但是无论我选择上述三种方法中的哪一种,用 2 个线程(默认值,即 CPU 内核数)并行执行都需要 196 秒(没有写错!),
- 如果我用 4 覆盖 OMP 的默认线程数以考虑 HT,则并行渲染时间下降到 177 秒。
为什么会这样?我在并行代码中看不到任何明显的瓶颈。
编辑:澄清一下 - 任务池只是其中一种实现,请仔细阅读问题 - 向下滚动即可看到并行 for 的版本。问题是,它们和任务池一样慢!
// Renders a w x h image into a freshly allocated byte buffer (3 bytes per
// pixel, row-major) and hands the buffer to write_png() under the name fname.
// Within each row, the pixels are shared out among an OpenMP team of 4 threads.
//
// The thread team is created once, outside the row loop: every thread walks
// all rows, and the `omp for` splits the x range of the current row among
// them. Opening a new `parallel for` region per row would instead pay for
// team start-up and a join barrier h times.
void draw_parallel_for(int w, int h, const char *fname) {
    unsigned char *buf;
    buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);
    #pragma omp parallel num_threads(4)
    {
        // y is declared inside the parallel region, so each thread has its own copy.
        for (int y = 0; y < h; ++y) {
            // nowait: every pixel writes only to its own 3-byte slot, so threads
            // need not wait for each other between rows. This assumes RenderPixel
            // calls for different pixels are independent, as the other parallel
            // variants in this file already do.
            #pragma omp for nowait
            for (int x = 0; x < w; ++x)
                Scene::GetInstance().RenderPixel(x, y, buf + (y * w + x) * 3);
        }
    }
    // The implicit barrier at the end of the parallel region guarantees that
    // all pixels are rendered before the image is written.
    write_png(buf, w, h, fname);
    delete [] buf;
}
// Renders a w x h image into a freshly allocated byte buffer (3 bytes per
// pixel, row-major) and hands the buffer to write_png() under the name fname.
// The image is treated as one flat range of w * h pixel indices, which an
// OpenMP team of 4 threads divides among its members.
void draw_parallel_for2(int w, int h, const char *fname) {
    unsigned char *pixels = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    const int pixel_count = w * h;
    #pragma omp parallel for num_threads(4)
    for (int idx = 0; idx < pixel_count; ++idx) {
        // Recover the 2-D coordinates from the flat index. Both are declared
        // inside the loop body, so every thread works on its own copies.
        const int row = idx / w;
        const int col = idx % w;
        // idx equals row * w + col, so the pixel's slot starts at idx * 3.
        Scene::GetInstance().RenderPixel(col, row, pixels + idx * 3);
    }

    write_png(pixels, w, h, fname);
    delete [] pixels;
}
// Renders a w x h image into a freshly allocated byte buffer (3 bytes per
// pixel, row-major) and hands the buffer to write_png() under the name fname.
// Whole rows are distributed among an OpenMP team of 4 threads; each thread
// renders the pixels of its rows one after another.
void draw_parallel_for3(int w, int h, const char *fname) {
    unsigned char *image = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    #pragma omp parallel for num_threads(4)
    for (int row = 0; row < h; ++row) {
        // Start of this row inside the output buffer.
        unsigned char *row_start = image + row * w * 3;
        for (int col = 0; col < w; ++col) {
            Scene::GetInstance().RenderPixel(col, row, row_start + col * 3);
        }
    }

    write_png(image, w, h, fname);
    delete [] image;
}
// Single-threaded reference renderer: fills a freshly allocated byte buffer
// (3 bytes per pixel, row-major) one pixel at a time and hands it to
// write_png() under the name fname.
void draw_serial(int w, int h, const char *fname) {
    unsigned char *image = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    // Pixels are visited in exactly the order they are laid out in the buffer,
    // so a running output pointer advanced by 3 bytes per pixel always points
    // at the slot of the current (col, row).
    unsigned char *cursor = image;
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col, cursor += 3) {
            Scene::GetInstance().RenderPixel(col, row, cursor);
        }
    }

    write_png(image, w, h, fname);
    delete [] image;
}
// Pending work for draw_pooled(): one heap-allocated (x, y) pair per pixel
// still to be rendered. The producer allocates each entry; the worker that
// renders it deletes it. std::queue is not thread-safe, so any access from
// inside a parallel region must happen under the task_fetch critical section.
std::queue< std::pair<int, int> * > task_queue;

// Renders a w x h image into a freshly allocated byte buffer (3 bytes per
// pixel, row-major) and hands the buffer to write_png() under the name fname.
// Pixels are rendered by an OpenMP team of 4 threads that repeatedly pull
// coordinates from the shared task_queue until it is empty.
void draw_pooled(int w, int h, const char *fname) {
    unsigned char *buf;
    buf = new unsigned char[w * h * 3];
    Scene::GetInstance().PrepareRender(w, h);

    // Enqueue every pixel before any worker starts. Filling the queue from one
    // thread inside the parallel region while the other threads pop from it
    // would be a data race on the queue unless every push also took the
    // task_fetch lock, and would need a separately synchronized "all tasks
    // issued" flag so that workers do not quit on a momentarily empty queue.
    for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x)
            task_queue.push(new std::pair<int, int>(x, y));
    }

    #pragma omp parallel shared(buf, w, h) num_threads(4)
    {
        while (true) {
            std::pair<int, int> *coords = NULL;
            // Only the dequeue is serialized; rendering runs outside the lock.
            #pragma omp critical(task_fetch)
            {
                if (!task_queue.empty()) {
                    coords = task_queue.front();
                    task_queue.pop();
                }
            }
            // The queue was completely filled before the team started, so an
            // empty queue means there is no work left for this thread.
            if (coords == NULL)
                break;
            Scene::GetInstance().RenderPixel(coords->first, coords->second,
                                             buf + (coords->second * w + coords->first) * 3);
            delete coords;
        }
    }
    // The implicit barrier at the end of the parallel region guarantees that
    // all pixels are rendered before the image is written.
    write_png(buf, w, h, fname);
    delete [] buf;
}