c - 内存是矩阵加法（SIMD 指令）的瓶颈吗？

Question

我正在尝试使用 SIMD 指令（_mm256_add_pd、存储、加载等）优化 C 中的二维矩阵加法。但是，我根本没有看到很大的加速。使用一些时序代码，我看到在 0.8x-1.5x 范围内的加速是天真的解决方案）。我想知道这是否是典型的？我在想这可能是一个内存瓶颈，因为在这种情况下计算似乎很少。我相信这应该会给我大约 4 倍的速度提升，因为我一次要进行 4 次加法，所以我不完全确定瓶颈是什么。

我编写了一些代码来演示我在做什么（测试并行 + SIMD 与仅 SIMD）：

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <time.h>
#include <omp.h>
#include <string.h>

#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#include <immintrin.h>
#include <x86intrin.h>
#endif

void add_matrix_naive (double **result, double **mat1, double **mat2, int rows, int cols) {
    int simdCols = cols / 4 * 4;
        if(simdCols > 0){
            for(unsigned int i = 0; i < rows; i++){
                for(unsigned int j = 0; j < simdCols; j += 4){
                    _mm256_storeu_pd(result[i] + j, _mm256_add_pd(
                        _mm256_loadu_pd(mat1[i] + j)
                        , _mm256_loadu_pd(mat2[i] + j)));
                }
            }
        }

        //Handle extra columns
        if(simdCols < cols){
            for(unsigned int i = 0; i < rows; i++){ 
                for(unsigned int j = simdCols; j < cols; j++){
                    result[i][j] = mat1[i][j] + mat2[i][j];
                }
            }
        }
}

void add_matrix(double **result, double **mat1, double **mat2, int rows, int cols) {
    int simdCols = cols / 4 * 4;
    #pragma omp parallel if (rows*cols >= 2000)
    {
        if(simdCols > 0){
            #pragma omp for collapse(2)
            for(unsigned int i = 0; i < rows; i++){
                for(unsigned int j = 0; j < simdCols; j += 4){
                    _mm256_storeu_pd(result[i] + j, _mm256_add_pd(
                        _mm256_loadu_pd(mat1[i] + j)
                        , _mm256_loadu_pd(mat2[i] + j)));
                }
            }
        }

        //Handle extra columns
        if(simdCols < cols){
            #pragma omp for collapse(2)
            for(unsigned int i = 0; i < rows; i++){ 
                for(unsigned int j = simdCols; j < cols; j++){
                    result[i][j] = mat1[i][j] + mat2[i][j];
                }
            }
        }

    }
}

int main() 
{ 
    omp_set_num_threads(8);
    //Allocate Matrices
    int rows = 200;
    int cols = 200;

    double **matrix_a = malloc(rows * sizeof(double *) + rows*cols*sizeof(double));

    double * dataStart = (double *) matrix_a + rows; //Offset row pointers
    for(unsigned int i = 0; i < rows; i++){
        matrix_a[i] = dataStart + i * cols;
        memset(matrix_a[i], 0, sizeof(double) * cols);
    }

    double **matrix_b = malloc(rows * sizeof(double *) + rows*cols*sizeof(double));

    dataStart = (double *) matrix_b + rows; //Offset row pointers
    for(unsigned int i = 0; i < rows; i++){
        matrix_b[i] = dataStart + i * cols;
        memset(matrix_b[i], 0, sizeof(double) * cols);
    }

    double **result = malloc(rows * sizeof(double *) + rows*cols*sizeof(double));

    dataStart = (double *) result + rows; //Offset row pointers
    for(unsigned int i = 0; i < rows; i++){
        result[i] = dataStart + i * cols;
        memset(result[i], 0, sizeof(double) * cols);
    }

    //Assign random values to matrices.
    for(int i = 0; i < rows; i++){
        for(int j = 0; j < cols; j++){
            matrix_a[i][j] = rand();
            matrix_b[i][j] = rand();
        }
    }

    
    int LOOP_COUNT = 4;

    double prevTime = omp_get_wtime();
    for(int i = 0; i < LOOP_COUNT; i++){
        add_matrix(result, matrix_a, matrix_b, rows, cols);
        
    }
    double endTime = omp_get_wtime();
    double firstTime = (endTime - prevTime)/LOOP_COUNT;
    printf("Took %f Seconds\n", firstTime);

    //Assign random values to matrices.
    for(int i = 0; i < rows; i++){
        for(int j = 0; j < cols; j++){
            matrix_a[i][j] = rand();
            matrix_b[i][j] = rand();
        }
    }

    prevTime = omp_get_wtime();
    for(int i = 0; i < LOOP_COUNT; i++){
        add_matrix_naive(result, matrix_a, matrix_b, rows, cols);
    }
    endTime = omp_get_wtime();
    double secondTime = (endTime - prevTime)/LOOP_COUNT;
    printf("Took %f Seconds\n", secondTime);
    printf("Naive Time: %f Faster\n", firstTime/secondTime);
}

我注意到结果似乎完全取决于 LOOP_COUNT。高循环数的并行/SIMD 版本做得很好，但低循环数的幼稚解决方案往往会做得更好。

score 1 · Accepted Answer

CPU 可以比访问内存更快地计算事物。

添加双打非常快。您的核心很可能在内存上遇到瓶颈。

还有更多。

omp_set_num_threads

很少有好主意。默认值通常很好。

双**结果，双**mat1，双**mat2

双指针意味着访问它们的双倍 RAM 延迟。使用单个指针。如果要使用矩阵的切片，只需在矩阵的行之间传递另一个带有偏移量的参数。

但最重要的是，您的基准测试完全错误。

要比较算法的不同实现，您必须编译 2 个不同的程序。当您在单个程序中运行这两种实现时，会发生许多有趣的事情：缓存层次结构启动，CPU 有另一个用于解码指令的专用缓存，现代 CPU 在热节流方面非常快，等等。
编译器并不愚蠢。当他们意识到您在不使用结果的情况下计算某些东西时，他们可以删除代码。当他们意识到您在不更改输入数据的情况下多次计算某些内容时，他们可以消除冗余代码并且只计算一次。

c - 内存是矩阵加法（SIMD 指令）的瓶颈吗？

1 回答 1

Related

Reference