1

我正在使用 OpenGL 实例化方法进行测试,并测试了 glDrawArraysInstanced 和 glDrawElementsInstanced 函数,其余代码几乎相同。测试是渲染 100 万个可以旋转和平移的彩色方块(缩放为像素并适合屏幕),并查看 FPS 差异。我得到的结果是:

  • 使用 glDrawArraysInstanced:36-39 FPS
  • 使用 glDrawElementsInstanced:24-28 FPS

根据我的研究,我没有关于哪个函数应该更快的结论,不过我更喜欢使用 glDrawElementsInstanced。

所以我的问题是,在这种情况下(或一般情况下),是否有办法改进 glDrawElementsInstanced 的性能,使其绘制速度与 glDrawArraysInstanced 相同甚至更快。这是我用于 glDrawArraysInstanced 的代码:

int main(int argc, char** argv) {

int width = 1000, height = 600;
SDLWindow window(width, height, "window");
const char* glsl_version = "#version 330";

srand((unsigned int)time(NULL));

Shader shader("shaders/shader.vert", "shaders/shader.frag");

float quadVertices[] = {
    // positions     
    -1.0f,  1.0f,
     1.0f, -1.0f,
    -1.0f, -1.0f,

    -1.0f,  1.0f,
     1.0f, -1.0f,
     1.0f,  1.0f
};

float colors[] = {
    1.0f, 0.0f, 0.0f,
    0.0f, 1.0f, 0.0f,
    0.0f, 0.0f, 1.0f,

    1.0f, 0.0f, 0.0f,
    0.0f, 1.0f, 0.0f,
    0.0f, 1.0f, 1.0f
};

// Gera VAO e buffers
GLuint VBO, VAO, TRANSFORM;

unsigned int amount = 1000000;

glm::mat4 *modelMatrices;
modelMatrices = new glm::mat4[amount];

GLsizei vec4Size = sizeof(glm::vec4);

glGenVertexArrays(1, &VAO);
glGenBuffers(1, &VBO);
glGenBuffers(1, &TRANSFORM);

glBindVertexArray(VAO);

// configuração de VBO
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glBufferData(GL_ARRAY_BUFFER, sizeof(quadVertices) + sizeof(colors), nullptr, GL_STREAM_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(quadVertices), quadVertices);
glBufferSubData(GL_ARRAY_BUFFER, sizeof(quadVertices), sizeof(colors), colors);

glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 2 * sizeof(float), (void*)0);
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float), (void*)(sizeof(quadVertices)));

glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);

// configuração de TRANSFORM
glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
glBufferData(GL_ARRAY_BUFFER, amount * sizeof(glm::vec2) + amount * sizeof(glm::vec2) + amount * sizeof(glm::vec4), nullptr, GL_STREAM_DRAW);
glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)0);
glVertexAttribPointer(3, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)(sizeof(glm::vec2) * amount));
glVertexAttribPointer(4, 4, GL_FLOAT, GL_FALSE, sizeof(glm::vec4), (void*)(sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount));

glEnableVertexAttribArray(2);
glEnableVertexAttribArray(3);
glEnableVertexAttribArray(4);
glVertexAttribDivisor(2, 1);
glVertexAttribDivisor(3, 1);
glVertexAttribDivisor(4, 1);

glBindVertexArray(0);

MathUtils mu;

vector <float> randNumbers;

float amountSqrt = glm::sqrt(amount);

for (int i = 0; i < amount; i++)
{
    randNumbers.push_back(mu.GenRandFloat(-1.0f, 1.0f) * amountSqrt);
}
glm::vec2 *scale = new glm::vec2[amount];
glm::vec2 *translate = new glm::vec2[amount];
glm::vec4 *rotate = new glm::vec4[amount];

void *transformBfrPtr = nullptr;

bool quit = 0;
while (!quit)
{
    glClearColor(1.0, 1.0, 1.0, 1.0);
    glClear(GL_COLOR_BUFFER_BIT);

    shader.use();

    glBindVertexArray(VAO);

    glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, 0, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
    for (int i = 0; i < amount; i++)
    {
        scale[i][0] = 1.0f / amountSqrt;
        scale[i][1] = 1.0f / amountSqrt;
    }

    std::memcpy(transformBfrPtr, scale, sizeof(scale[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    for (int i = 0; i < amount; i++)
    {
        translate[i][0] = randNumbers[i];
        translate[i][1] = randNumbers[amount - i];
    }

    std::memcpy(transformBfrPtr, translate, sizeof(translate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount, sizeof(glm::vec4) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    float var = SDL_GetTicks() / 1000.0f;
    for (int i = 0; i < amount; i++)
    {
        rotate[i][0] = cos(var);
        rotate[i][1] = sin(var);
        rotate[i][2] = -sin(var);
        rotate[i][3] = cos(var);
    }

    std::memcpy(transformBfrPtr, rotate, sizeof(rotate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);

    glDrawArraysInstanced(GL_TRIANGLES, 0, 6, amount);

    switch (window.SDL_ManageEvent())
    {
    case SDL_QUIT:
        SDL_Quit();
        glDeleteBuffers(1, &VBO);
        glDeleteVertexArrays(1, &VAO);
        break;
    }

    window.SDL_SwapWindow();
}


return 0;
}

以及使用 glDrawElementsInstanced 的代码(几乎相同,但带有索引):

int main(int argc, char** argv) {

int width = 1000, height = 600;
SDLWindow window(width, height, "window");
const char* glsl_version = "#version 330";

srand((unsigned int)time(NULL));

Shader shader("shaders/shader.vert", "shaders/shader.frag");

float quadVertices[] = {
    // positions     
    -1.0f,  1.0f,
     1.0f, -1.0f,
    -1.0f, -1.0f,
     1.0f,  1.0f
};

float colors[] = {
    1.0f, 0.0f, 0.0f,
    0.0f, 1.0f, 0.0f,
    0.0f, 0.0f, 1.0f,
    0.0f, 1.0f, 1.0f
};

unsigned int indices[] = {
    0, 1, 2,
    0, 1, 3
};

// Gera VAO e buffers
GLuint VBO, VAO, TRANSFORM, EBO;

unsigned int amount = 1000000;

glm::mat4 *modelMatrices;
modelMatrices = new glm::mat4[amount];

GLsizei vec4Size = sizeof(glm::vec4);

glGenVertexArrays(1, &VAO);
glGenBuffers(1, &VBO);
glGenBuffers(1, &TRANSFORM);
glGenBuffers(1, &EBO);

glBindVertexArray(VAO);

// configuração de VBO
glBindBuffer(GL_ARRAY_BUFFER, VBO);
glBufferData(GL_ARRAY_BUFFER, sizeof(quadVertices) + sizeof(colors), nullptr, GL_STREAM_DRAW);
glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(quadVertices), quadVertices);
glBufferSubData(GL_ARRAY_BUFFER, sizeof(quadVertices), sizeof(colors), colors);

glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 2 * sizeof(float), (void*)0);
glVertexAttribPointer(1, 3, GL_FLOAT, GL_FALSE, 3 * sizeof(float), (void*)(sizeof(quadVertices)));

glEnableVertexAttribArray(0);
glEnableVertexAttribArray(1);

// configuração de EBO
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, EBO);
glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STREAM_DRAW);

// configuração de TRANSFORM
glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
glBufferData(GL_ARRAY_BUFFER, amount * sizeof(glm::vec2) + amount * sizeof(glm::vec2) + amount * sizeof(glm::vec4), nullptr, GL_STREAM_DRAW);
glVertexAttribPointer(2, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)0);
glVertexAttribPointer(3, 2, GL_FLOAT, GL_FALSE, sizeof(glm::vec2), (void*)(sizeof(glm::vec2) * amount));
glVertexAttribPointer(4, 4, GL_FLOAT, GL_FALSE, sizeof(glm::vec4), (void*)(sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount));

glEnableVertexAttribArray(2);
glEnableVertexAttribArray(3);
glEnableVertexAttribArray(4);
glVertexAttribDivisor(2, 1);
glVertexAttribDivisor(3, 1);
glVertexAttribDivisor(4, 1);

glBindVertexArray(0);

glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);

MathUtils mu;

vector <float> randNumbers;

float amountSqrt = glm::sqrt(amount);

for (int i = 0; i < amount; i++)
{
    randNumbers.push_back(mu.GenRandFloat(-1.0f, 1.0f) * amountSqrt);
}
glm::vec2 *scale = new glm::vec2[amount];
glm::vec2 *translate = new glm::vec2[amount];
glm::vec4 *rotate = new glm::vec4[amount];

void *transformBfrPtr = nullptr;

bool quit = 0;
while (!quit)
{
    glClearColor(1.0, 1.0, 1.0, 1.0);
    glClear(GL_COLOR_BUFFER_BIT);

    shader.use();

    glBindVertexArray(VAO);

    glBindBuffer(GL_ARRAY_BUFFER, TRANSFORM);
    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, 0, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
    for (int i = 0; i < amount; i++)
    {
        scale[i][0] = 1.0f / amountSqrt;
        scale[i][1] = 1.0f / amountSqrt;
    }

    std::memcpy(transformBfrPtr, scale, sizeof(scale[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount, sizeof(glm::vec2) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    for (int i = 0; i < amount; i++)
    {
        translate[i][0] = randNumbers[i];
        translate[i][1] = randNumbers[amount - i];
    }

    std::memcpy(transformBfrPtr, translate, sizeof(translate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);


    transformBfrPtr = glMapBufferRange(GL_ARRAY_BUFFER, sizeof(glm::vec2) * amount + sizeof(glm::vec2) * amount, sizeof(glm::vec4) * amount, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT);

    float var = SDL_GetTicks() / 1000.0f;
    for (int i = 0; i < amount; i++)
    {
        rotate[i][0] = cos(var);
        rotate[i][1] = sin(var);
        rotate[i][2] = -sin(var);
        rotate[i][3] = cos(var);
    }

    std::memcpy(transformBfrPtr, rotate, sizeof(rotate[0]) * amount);
    glUnmapBuffer(GL_ARRAY_BUFFER);

    glDrawElementsInstanced(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0, amount);

    switch (window.SDL_ManageEvent())
    {
    case SDL_QUIT:
        SDL_Quit();
        glDeleteBuffers(1, &VBO);
        glDeleteVertexArrays(1, &VAO);
        break;
    }

    window.SDL_SwapWindow();
}


return 0;
}

顶点着色器代码(如果有帮助):

#version 330 core
// Per-vertex attributes of the quad.
layout (location = 0) in vec2 aPos;
layout (location = 1) in vec3 aColor;
// Transform attributes; the host code sets their attribute divisor to 1, so
// each instance receives one value of each.
layout (location = 2) in vec2 scale;
layout (location = 3) in vec2 translation;
layout (location = 4) in vec4 rotation; // 2x2 matrix packed as (column0.xy, column1.xy)

out vec3 color;

// Rotates, translates and then scales the quad corner, and forwards the
// vertex color to the fragment stage.
void main()
{
    // Distinct local name: the original reused `rotation`, shadowing the
    // vertex input inside its own initializer.
    mat2 rotationMatrix = mat2(rotation.xy, rotation.zw);
    // vector * matrix treats aPos as a row vector.
    gl_Position = vec4((aPos * rotationMatrix + translation) * scale, 0.0, 1.0);

    color = aColor;
}

提前致谢。

更新: 好的,看来我有一个与此无关的问题。我在另一台机器上测试了这两个代码,结果让我感到惊讶。首先是一些背景:上面的测试是在这个(PC)设置上进行的:16GB ram,i7 4790k,GTX 970。

然后我用这个设置在我的笔记本电脑上做了同样的测试:8GB ram、i5 8250u、Geforce MX150 (2GB)

结果:

  • 使用 glDrawArraysInstanced:73-87 FPS
  • 使用 glDrawElementsInstanced:74-87 FPS

我还注意到,在笔记本电脑上,GPU 使用率一直在 99% 左右,而在 PC 上,使用 glDrawElementsInstanced 和 glDrawArraysInstanced 时 GPU 使用率分别在 45% 和 60% 左右波动。在这两种情况下,所有内核的 CPU 使用率都低于 60%(因此可以排除 CPU 瓶颈)。我尝试用 DDU 卸载 GPU 驱动程序并重新安装,但问题依旧。我还检查了两个系统中的 Nvidia 控制面板,设置完全相同。两台机器都安装了 Windows 10,并且在 Windows 控制面板中都设置为最高性能。我最初的问题算是得到了部分回答,因为两个绘制调用函数之间的 FPS 差异在不同的机器上表现得非常不同。怎么可能有这样的差异?欢迎任何建议。

4

0 回答 0