2

代码如下。我使用的是 2015 年 8 月 5 日的 Halide 主干(trunk)版本,环境为 Win64、VS2013。当我执行 diag.compile_to_lowered_stmt("diag.html", {}, HTML) 时,halide.dll 中发生了堆栈溢出。

// Reproduction case: builds a direction-adaptive filter pipeline ("diag") and
// dumps its lowered statement to HTML. The final compile call is where the
// stack overflow was reported.

// Load the input image as 8-bit samples.
Image<uint8_t> orig_uint = Tools::load_image("../foo.ppm");

// Pure variables (x, y = pixel coordinates, c = channel index) and one named
// Func per pipeline stage.
Var x, y, c;
Func orig("orig"), orig_lum("orig_lum"), m45("m45"), m135("m135"), f45("f45"), f135("f135"), f4x4_horiz("f4x4_horiz"), f4x4("f4x4"), diag("diag");

// Wrap the input with a repeat-edge boundary condition; presumably this is what
// lets the offset taps below (x - 1 .. x + 2, y - 1 .. y + 2) read past the
// image borders.
Func orig_clamped = BoundaryConditions::repeat_edge(orig_uint);

// Filter tap weights: outer taps use wta, inner taps use wtb.
// NOTE(review): wt0, wt1 and wt2 are not referenced anywhere in this snippet.
const float wta = 1.0f, wtb = 3.0f, wt0 = wta * wta, wt1 = wta * wtb, wt2 = wtb * wtb;

// Input samples converted to float_t.
orig(x, y, c) = cast<float_t>(orig_clamped(x, y, c));

// Single-channel luminance: weighted sum of channels 0, 1 and 2.
orig_lum(x, y) = 0.299f * orig(x, y, 0) + 0.587f * orig(x, y, 1) + 0.114f * orig(x, y, 2);

// Sum of absolute luminance differences between consecutive samples along the
// diagonal running from (x - 1, y - 1) to (x + 2, y + 2).
m45(x, y) = abs(orig_lum(x - 1, y - 1) - orig_lum(x, y)) + abs(orig_lum(x, y) - orig_lum(x + 1, y + 1)) + abs(orig_lum(x + 1, y + 1) - orig_lum(x + 2, y + 2));

// Same measure along the opposite diagonal, from (x + 2, y - 1) to (x - 1, y + 2).
m135(x, y) = abs(orig_lum(x + 2, y - 1) - orig_lum(x + 1, y)) + abs(orig_lum(x + 1, y) - orig_lum(x, y + 1)) + abs(orig_lum(x, y + 1) - orig_lum(x - 1, y + 2));

// 4-tap filter along the m45 diagonal with weights wta, wtb, wtb, wta.
// The result is not normalized (weights sum to 8).
f45(x, y, c) = wta * (orig(x - 1, y - 1, c) + orig(x + 2, y + 2, c)) + wtb * (orig(x, y, c) + orig(x + 1, y + 1, c));

// 4-tap filter along the m135 diagonal, same weights, also not normalized.
f135(x, y, c) = wta * (orig(x - 1, y + 2, c) + orig(x + 2, y - 1, c)) + wtb * (orig(x, y + 1, c) + orig(x + 1, y, c));

// Horizontal 4-tap pass of the 4x4 filter.
f4x4_horiz(x, y, c) = wta * (orig(x - 1, y, c) + orig(x + 2, y, c)) + wtb * (orig(x, y, c) + orig(x + 1, y, c));

// Vertical 4-tap pass applied to the horizontal result, giving a 4x4 footprint.
// NOTE(review): the combined weights sum to 64 here versus 8 for f45/f135, so
// the branches of diag below are scaled differently -- confirm this is intended.
f4x4(x, y, c) = wta * (f4x4_horiz(x, y - 1, c) + f4x4_horiz(x, y + 2, c)) + wtb * (f4x4_horiz(x, y, c) + f4x4_horiz(x, y + 1, c));

// Pick the filter by comparing the two diagonal measures:
//   m135 > m45  -> f45
//   m45 > m135  -> f135
//   equal       -> f4x4
diag(x, y, c) = select(m135(x, y) > m45(x, y), f45(x, y, c), select(m45(x, y) > m135(x, y), f135(x, y, c), f4x4(x, y, c)));

// Schedule.
// orig_lum, m45, m135 and diag are computed at root; m45 and m135 additionally
// have x and y bounded to the input image's width and height.
// f45, f135 and f4x4 are computed at diag's x loop.
// orig and f4x4_horiz receive no explicit schedule in this snippet.
orig_lum.compute_root();
m45.compute_root().bound(x, 0, orig_uint.width()).bound(y, 0, orig_uint.height());
m135.compute_root().bound(x, 0, orig_uint.width()).bound(y, 0, orig_uint.height());
f45.compute_at(diag, x);
f135.compute_at(diag, x);
f4x4.compute_at(diag, x);
diag.compute_root();

// Emit the lowered statement as HTML ("diag.html") so the generated loop nest
// can be inspected.
diag.compile_to_lowered_stmt("diag.html", {}, HTML);    // stack overflow reported here

有什么想法吗?如果你愿意提供一个改进的调度(schedule),我也很乐意接受——我只是想先让一些基本的东西跑起来。

(我在 diag.compute_root() 之后添加了一对 bound() 调用,但这似乎没有帮助。我确实想最终限制 diag 系数。)

4

1 回答 1

4

默认情况下,MSVC 使用的堆栈相当小(1 MB),而 Halide 的许多编译器 pass 都会进行很深的递归。我的做法是把堆栈大小增加到 8 MB(例如添加 /STACK:8388608,1048576;注意 /STACK 是链接器选项)。我们在 Windows 上运行 Halide 测试时就是这样做的。

于 2015-08-24T20:52:51.453 回答