c++ - 四倍四过采样性能

Question

在开发一个从根本上依赖四乘四过采样的渲染引擎时，我遇到了缩小（下采样）步骤本身的性能问题。

#include <stdint.h>

    // sRGBtolinear[s]: lookup table indexed by an 8-bit sRGB channel value (0..255).
    // Entries increase from 0 up to 65536; RGBavg16() sums 16 of them per channel.
    // NOTE(review): presumably the sRGB-to-linear transfer curve scaled so that
    // full intensity is 65536 -- confirm against the generator of this table.
    const int_fast32_t sRGBtolinear[256] = {0, 20, 40, 60, 80, 99, 119, 139, 159, 179, 199, 219, 241, 264, 288, 313, 340, 367, 396, 427, 458, 491, 526, 562, 599, 637, 677, 718, 761, 805, 851, 898, 947, 997, 1048, 1101, 1156, 1212, 1270, 1330, 1391, 1453, 1517, 1583, 1651, 1720, 1791, 1863, 1937, 2013, 2090, 2170, 2250, 2333, 2418, 2504, 2592, 2681, 2773, 2866, 2961, 3058, 3157, 3258, 3360, 3464, 3570, 3678, 3788, 3900, 4014, 4129, 4247, 4366, 4488, 4611, 4736, 4864, 4993, 5124, 5257, 5392, 5530, 5669, 5810, 5953, 6099, 6246, 6395, 6547, 6701, 6856, 7014, 7174, 7336, 7500, 7666, 7834, 8004, 8177, 8352, 8529, 8708, 8889, 9072, 9258, 9446, 9636, 9828, 10022, 10219, 10418, 10619, 10822, 11028, 11236, 11446, 11658, 11873, 12090, 12309, 12531, 12754, 12981, 13209, 13440, 13673, 13909, 14147, 14387, 14629, 14874, 15122, 15372, 15624, 15878, 16135, 16394, 16656, 16920, 17187, 17456, 17727, 18001, 18278, 18556, 18838, 19121, 19408, 19696, 19988, 20281, 20578, 20876, 21178, 21481, 21788, 22096, 22408, 22722, 23038, 23357, 23679, 24003, 24329, 24659, 24991, 25325, 25662, 26002, 26344, 26689, 27036, 27387, 27739, 28095, 28453, 28813, 29177, 29543, 29911, 30283, 30657, 31033, 31413, 31795, 32180, 32567, 32957, 33350, 33746, 34144, 34545, 34949, 35355, 35765, 36177, 36591, 37009, 37429, 37852, 38278, 38707, 39138, 39572, 40009, 40449, 40892, 41337, 41786, 42237, 42691, 43147, 43607, 44069, 44534, 45003, 45474, 45947, 46424, 46904, 47386, 47871, 48360, 48851, 49345, 49842, 50342, 50844, 51350, 51859, 52370, 52884, 53402, 53922, 54445, 54972, 55501, 56033, 56568, 57106, 57647, 58191, 58738, 59288, 59841, 60397, 60956, 61518, 62083, 62651, 63222, 63796, 64373, 64953, 65536};
    // lineartosRGBthr[s]: ascending thresholds searched by lineartosRGB(), which
    // returns the largest index whose threshold is <= its argument.
    // NOTE(review): each entry appears to be the rounded midpoint between
    // sRGBtolinear[s-1] and sRGBtolinear[s] (e.g. 10 = (0+20)/2), which would make
    // the search round to the nearest sRGB code -- confirm.
    const int_fast32_t lineartosRGBthr[256] = {0, 10, 30, 50, 70, 90, 110, 130, 150, 170, 189, 209, 230, 253, 276, 301, 327, 354, 382, 412, 443, 475, 509, 544, 580, 618, 657, 698, 740, 783, 828, 875, 923, 972, 1023, 1075, 1129, 1185, 1242, 1300, 1360, 1422, 1486, 1551, 1617, 1685, 1755, 1827, 1900, 1975, 2052, 2130, 2210, 2292, 2376, 2461, 2548, 2637, 2727, 2820, 2914, 3010, 3108, 3208, 3309, 3412, 3518, 3625, 3734, 3844, 3957, 4072, 4188, 4307, 4427, 4550, 4674, 4800, 4929, 5059, 5191, 5325, 5461, 5600, 5740, 5882, 6026, 6172, 6321, 6471, 6624, 6779, 6935, 7094, 7255, 7418, 7583, 7750, 7920, 8091, 8265, 8440, 8618, 8798, 8981, 9165, 9352, 9541, 9732, 9925, 10121, 10318, 10518, 10721, 10925, 11132, 11341, 11552, 11766, 11981, 12200, 12420, 12643, 12868, 13095, 13325, 13557, 13791, 14028, 14267, 14508, 14752, 14998, 15247, 15498, 15751, 16007, 16265, 16525, 16788, 17054, 17322, 17592, 17864, 18140, 18417, 18697, 18980, 19265, 19552, 19842, 20135, 20430, 20727, 21027, 21330, 21635, 21942, 22252, 22565, 22880, 23198, 23518, 23841, 24166, 24494, 24825, 25158, 25494, 25832, 26173, 26517, 26863, 27212, 27563, 27917, 28274, 28633, 28995, 29360, 29727, 30097, 30470, 30845, 31223, 31604, 31987, 32373, 32762, 33154, 33548, 33945, 34345, 34747, 35152, 35560, 35971, 36384, 36800, 37219, 37641, 38065, 38493, 38923, 39355, 39791, 40229, 40671, 41115, 41562, 42011, 42464, 42919, 43377, 43838, 44302, 44769, 45238, 45711, 46186, 46664, 47145, 47629, 48116, 48605, 49098, 49593, 50092, 50593, 51097, 51604, 52114, 52627, 53143, 53662, 54184, 54709, 55236, 55767, 56300, 56837, 57377, 57919, 58465, 59013, 59564, 60119, 60676, 61237, 61800, 62367, 62936, 63509, 64084, 64663, 65245};

 // Converts a linear-scale channel value to an 8-bit code by binary search over
 // lineartosRGBthr: returns the largest index (1..255) whose threshold is <= value,
 // or 0 when even lineartosRGBthr[1] exceeds value. Index 0 is never probed.
 uint_least8_t lineartosRGB(int32_t value){
     uint_least8_t code = 0;
     // Decide one result bit per iteration, from the most significant (128) down to 1.
     for (int step = 128; step > 0; step >>= 1) {
         if (value >= lineartosRGBthr[code + step]) {
             code = (uint_least8_t)(code + step);
         }
     }
     return code;
 }

 // Averages 16 packed pixels (red in bits 16..23, green in bits 8..15, blue in
 // bits 0..7) and returns one pixel packed the same way. Each channel is mapped
 // through sRGBtolinear, summed, divided by 16 with rounding, and mapped back
 // with lineartosRGB(). Bits 24..31 of the inputs are ignored and are 0 in the result.
 uint32_t RGBavg16(const uint32_t* pixel){
     int_fast32_t sum_r = 0;
     int_fast32_t sum_g = 0;
     int_fast32_t sum_b = 0;
     const uint32_t* const stop = pixel + 16;
     for (const uint32_t* p = pixel; p != stop; ++p) {
         const uint32_t packed = *p;
         sum_r += sRGBtolinear[(packed >> 16) & 0xFF];
         sum_g += sRGBtolinear[(packed >>  8) & 0xFF];
         sum_b += sRGBtolinear[ packed        & 0xFF];
     }
     // Adding 8 before the shift by 4 rounds the division by 16 to nearest.
     const uint32_t r = lineartosRGB((sum_r + 8) >> 4);
     const uint32_t g = lineartosRGB((sum_g + 8) >> 4);
     const uint32_t b = lineartosRGB((sum_b + 8) >> 4);
     return (r << 16) | (g << 8) | b;
 }

// Shrinks an oversampled image by a factor of 4 in each direction.
//   stagesize                  - {width, height} of the output image, in pixels
//   pixels                     - output buffer; pixel (x, y) is written at
//                                pixels[y*pixelsscanlineoffset + x]
//   pixelsscanlineoffset       - distance, in elements, between output rows
//   oversampled                - input buffer holding 4*width by 4*height samples
//   oversampledscanlineoffset  - distance, in elements, between input rows
// Each output pixel is RGBavg16() of the matching 4x4 block of input samples.
void fourtimesfouroversampling(int* stagesize, uint32_t* pixels, int pixelsscanlineoffset, uint32_t* oversampled, int oversampledscanlineoffset){
    for (int y = 0; y < stagesize[1]; ++y) {
        for (int x = 0; x < stagesize[0]; ++x) {
            uint32_t block[16];
            uint32_t* dst = block;
            for (int dy = 0; dy < 4; ++dy) {
                // Index of the first sample in row dy of the block whose
                // top-left corner is at (4*x, 4*y) in the oversampled image.
                const int rowstart = y*4*oversampledscanlineoffset + dy*oversampledscanlineoffset + x*4;
                for (int dx = 0; dx < 4; ++dx) {
                    *dst++ = oversampled[rowstart + dx];
                }
            }
            pixels[y*pixelsscanlineoffset + x] = RGBavg16(block);
        }
    }
}

四乘四过采样是一种抗锯齿方法：渲染方式与不使用抗锯齿或平滑的方法（二值轮廓渲染、最近邻图形等）完全相同，只是渲染到一个宽和高各为四倍的过采样画布上。然后使用箱形滤波器，对每个 4×4（共 16 个像素）的方块取线性平均值，将其缩小为一个像素。sRGB/线性转换是必需的，因为 sRGB 值不在线性刻度上，不能直接求平均。

要测试性能，请在前面的代码之后加上以下 main 代码，它会随机绘制一些过采样像素，然后对整幅图像进行缩小：

#include <stdlib.h>
#include <time.h>
    // Dimensions of the downscaled output frame, in pixels.
    // NOTE(review): sizing the file-scope arrays below with `const int` values is
    // accepted by C++ but not by ISO C, so this driver must be built as C++.
    const int width = 640;
    const int height = 480;
    // {width, height}; passed as `stagesize` to fourtimesfouroversampling().
    int stagesize[2] = {width, height};
    // Downscaled output frame, `width` elements per row.
    uint32_t pixels[width*height];
    // Oversampled input frame: four times the width and four times the height
    // of `pixels`, `width*4` elements per row.
    uint32_t oversampled[width*4*height*4];
// Benchmark driver: clears the oversampled buffer, then for each of 60 frames
// writes 4096 random values at random positions in it and downscales the whole
// frame into `pixels`.
int main(){
    const int overw = width*4;    // oversampled row length
    const int overh = height*4;   // oversampled row count
    const int total = overw*overh;

    for (int idx = 0; idx != total; ++idx) {
        oversampled[idx] = 0;
    }

    srand(time(NULL));

    int frame = 0;
    while (frame < 60) {
        for (int remaining = 4096; remaining > 0; --remaining) {
            oversampled[(rand() % overh) * overw + rand() % overw] = rand();
        }
        fourtimesfouroversampling(stagesize, pixels, width, oversampled, overw);
        ++frame;
    }
    return 0;
}

使用 -O3 编译时，该程序平均耗时约 3.731 秒。由于代码无法在一秒内完成 60 帧的缩小，因此无法维持 60fps 的渲染，使用该渲染器的 60fps 程序也无法全速运行。应该怎么做才能让四乘四过采样维持 60fps？

score 0 · Accepted Answer

有几个原因导致这比必要的慢得多。首先，您是如何测量速度的？您的 main() 看起来只运行了 60 帧，在您优化 fourtimesfouroversampling() 之后，这可能不足以获得准确的测量结果。您还应该只测量花费在 fourtimesfouroversampling() 中的时间，而不包括初始化代码、用随机值填充 oversampled[] 的循环、缓存预热等所花费的时间。

您在 lineartosRGB() 中执行的二分查找可能会非常慢。如果 CPU 支持条件移动指令，情况还不算太糟，但仍然有 7 次间接加载，而且由于每次加载的地址都取决于上一次加载的结果，因此无法有效地流水线化。使用预先计算好的、覆盖全部输入值 0 到 65536（共 65537 个条目）的查找表可能更快。

另一种可能性是不使用查找表，而改用浮点运算。这听起来很疯狂，但好处是您可以使用 SSE 指令一次处理多个像素。请参阅这个关于如何优化 pow() 的问题。

另外，我会避免使用临时数组 pixel[16]，而是直接将 RGBavg16() 合并到 fourtimesfouroversampling() 中。

godbolt.org 上的此示例实现了上面提到的部分更改（浮点运算除外）。无论使用哪种编译器，内部的两个循环都会被展开。每个像素仍然是单独处理的，因为很遗憾，表查找无法用 SSE 指令并行完成。

c++ - 四倍四过采样性能

1 回答 1

Related

Reference