
I'm trying to parallelize a Gaussian blur function using OpenMP, but I'm new to OpenMP. When I tried to parallelize the two for loops (I didn't think any variables needed to be private to each thread), it ended up running slower than before and the output was different. So what am I doing wrong? What should I do to make it run faster?

void gaussian_blur(float **src, float **dst, int w, int h, float sigma)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel, *ringbuf;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y][x] = src[y][x];
        return;
    }

    // create Gaussian kernel
    kernel = malloc(ksize * sizeof(float));
    ringbuf = malloc(ksize * sizeof(float));

    #pragma omp parallel for reduction(+ : sum)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }

    scale = 1.f / sum;
    #pragma omp parallel for
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    #pragma omp parallel for // this is the for loop I parallelized but ended up with wrong output and running slower
    for (y = 0; y < h; y++)
    {
        int x1;
        int bufi0 = ksize-1;
        float tmp = src[y][0];

        for (x1 = 0; x1 < halfk  ; x1++) ringbuf[x1] = tmp;
        for (; x1 < ksize-1; x1++) ringbuf[x1] = src[y][x1-halfk];

        for (x1 = 0; x1 < w; x1++)
        {
            if(x1 < xmax)
                ringbuf[bufi0++] = src[y][x1+halfk];
            else
                ringbuf[bufi0++] = src[y][w-1];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y][x1] = convolve(kernel, ringbuf, ksize, bufi0);
        }
    }

    // blur each column
    #pragma omp parallel for // this is the for loop I parallelized but ended up with wrong output and running slower
    for (x = 0; x < w; x++)
    {
        int y1;
        int bufi0 = ksize-1;
        float tmp = dst[0][x];
        for (y1 = 0; y1 < halfk  ; y1++) ringbuf[y1] = tmp;
        for (     ; y1 < ksize-1; y1++) ringbuf[y1] = dst[y1-halfk][x];

        for (y1 = 0; y1 < h; y1++)
        {
            if(y1 < ymax)
                ringbuf[bufi0++] = dst[y1+halfk][x];
            else
                ringbuf[bufi0++] = dst[h-1][x];

            if (bufi0 == ksize) bufi0 = 0;
            dst[y1][x] = convolve(kernel, ringbuf, ksize, bufi0);
        }
    }

    // clean up
    free(kernel);
    free(ringbuf);
}

2 Answers


Besides the need to properly identify private and shared data, there are several things you can do to speed up your program.

As a first step you should remove any unnecessary concurrency. For example, how big is ksize on average? If it is less than a few hundred elements, there is absolutely no point in using OpenMP for such simple operations as computing the kernel and then normalising it:

#pragma omp parallel for reduction(+ : sum)
for (i = 0; i < ksize; i++)
{
    float x = (float)(i - halfk);
    float t = expf(scale * x * x);
    kernel[i] = t;
    sum += t;
}

scale = 1.f / sum;
#pragma omp parallel for
for (i = 0; i < ksize; i++)
    kernel[i] *= scale;

On a typical modern CPU, bootstrapping the parallel region costs more cycles than computing this on a single core. Also, on modern CPUs these loops can be unrolled and vectorised, giving up to an 8x boost on a single core. If the kernel is too small, then besides the OpenMP overhead you will also get a slowdown from excessive false sharing. You have to make sure that each thread gets an exact multiple of 16 elements (64-byte cache line size / sizeof(float)) to work on in order to prevent false sharing.
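
One simple way to avoid paying that overhead for small kernels is OpenMP's if clause, which keeps the loop serial unless a condition holds. A minimal sketch, assuming a hypothetical cutoff of 256 elements (tune it for your machine):

// Hypothetical threshold: only fork a thread team when the kernel is
// large enough to amortize the cost of the parallel region.
#define KSIZE_PAR_THRESHOLD 256

#pragma omp parallel for reduction(+ : sum) if(ksize > KSIZE_PAR_THRESHOLD)
for (i = 0; i < ksize; i++)
{
    float x = (float)(i - halfk);
    float t = expf(scale * x * x);
    kernel[i] = t;
    sum += t;
}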

You also have to make sure that the threads do not share cache lines in the column blur part.

// blur each column
#pragma omp parallel for
for (x = 0; x < w; x++)
{
    ...
    for (y1 = 0; y1 < h; y1++)
    {
        ...
        dst[y1][x] = convolve(kernel, ringbuf, ksize, bufi0);
    }
}

Because of the access pattern here, you have to make sure that each thread gets a chunk of columns that is a multiple of 16, otherwise there will be a boundary overlap region of 16*y1 pixels shared by every two consecutive threads where excessive false sharing will occur. If you cannot guarantee that w is divisible by 16, then you can give each thread a starting offset in the y direction, e.g. the innermost loop becomes:

int tid = omp_get_thread_num();

for (y1 = 2*tid; y1 < h; y1++)
{
   ...
}
for (y1 = 0; y1 < 2*tid; y1++)
{
   ...
}

The multiplier 2 is arbitrary. The idea is that the next thread starts several rows ahead of the current one, so that no two threads ever process the same row at the same time. You could also use addition and modulo arithmetic to compute y1, i.e.

for (y2 = 0; y2 < h; y2++)
{
    y1 = (y2 + 2*tid) % h;
    ...
}

but this is generally slower than splitting the loop in two parts.
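
For the chunking requirement mentioned above, a minimal sketch (not from the original answer) of one way to give each thread column chunks that are a multiple of 16 is OpenMP's static schedule with an explicit chunk size; this assumes w is a multiple of 16 and each row of dst starts on a 64-byte boundary:

// Sketch only: 16 floats = one 64-byte cache line, so with 16-column
// chunks two threads never write into the same cache line of dst
// (assuming aligned rows and w % 16 == 0).
#pragma omp parallel for schedule(static, 16)
for (x = 0; x < w; x++)
{
    /* ... column blur body as before ... */
}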

Also mind your data size. The last-level cache (LLC) has very high but still limited bandwidth. If the data cannot fit in the private caches of each core, compiler optimisations such as loop vectorisation can put very high pressure on the LLC. Things get even uglier if the data doesn't fit in the LLC at all and main memory has to be accessed.

If you don't know what false sharing is, there is an article in Dr. Dobb's that explains it.

Answered 2013-05-20T11:24:23.950

I may have fixed your code. You did not post your convolve function, so it's hard to say for sure, but I'm not sure it matters. There are at least two bugs. There is a race condition on the ringbuf array. To fix this I extended the array by the number of threads.

ringbuf = (float*)malloc(nthreads*ksize * sizeof(float));

To access the array, do something like this

int ithread = omp_get_thread_num();
ringbuf[ksize*ithread + x1]

Edit: I added some code where ringbuf is defined inside the parallel block. That way you don't have to index ringbuf by the thread number.
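
A minimal sketch of that pattern (the full listing is at the end of this answer):

// Each thread allocates its own private ringbuf inside the parallel
// region, so no indexing by omp_get_thread_num() is needed.
#pragma omp parallel
{
    float *ringbuf = (float*)_mm_malloc(ksize * sizeof(float), 16);
    #pragma omp for
    for (y = 0; y < h; y++)
    {
        /* ... row blur using this thread's ringbuf ... */
    }
    _mm_free(ringbuf);
}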

The second bug is the bufi0 variable. I defined a new one like this

const int ibufi0_fix = (x1+ksize-1)%ksize;

Below is the code I used to check it. Swap in your own convolve function. Note that it may still be rather inefficient. There are probably cache issues such as cache misses and false sharing (particularly in the vertical convolution). Hopefully, though, the image is correct now.

Edit: here is a paper by Intel that shows how to do this with AVX. It's optimized to minimize cache misses. I'm not sure if it's optimized for threading as well. http://software.intel.com/en-us/articles/iir-gaussian-blur-filter-implementation-using-intel-advanced-vector-extensions

I'm writing my own function for this (it's actually the reason I started learning OpenMP), which also uses SSE/AVX. There are a lot of similarities between matrix multiplication and image filtering, so I learned how to optimize matrix multiplication first and will get to Gaussian blur soon...

#include "math.h"
#include "omp.h"
#include "stdio.h"
#include <nmmintrin.h>

float convolve(const float *kernel, const float *ringbuf, const int ksize, const int bufi0) {
    float sum = 0.0f;
    for(int i=0; i<ksize; i++) {
        sum += kernel[i]*ringbuf[i];
    }
    return sum;
}


void gaussian_blur(float *src, float *dst, int w, int h, float sigma, int nthreads)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    printf("ksize %d\n", ksize);
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel, *ringbuf;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y*w + x] = src[y*w + x];
        return;
    }

    // create Gaussian kernel
    //kernel = malloc(ksize * sizeof(float));
    kernel =  (float*)_mm_malloc(ksize * sizeof(float),16);
    //ringbuf = malloc(ksize * sizeof(float));
    ringbuf = (float*)_mm_malloc(nthreads*ksize * sizeof(float),16);

    #pragma omp parallel for reduction(+ : sum) if(nthreads>1)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }

    scale = 1.f / sum;
    #pragma omp parallel for if(nthreads>1)
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    #pragma omp parallel for if(nthreads>1)// this is the for loop I parallelized but ended up with wrong output and running slower
    for (y = 0; y < h; y++)
    {
        int ithread = omp_get_thread_num();
        //printf("nthread %d\n", nthread);
        int x1;
        int bufi0 = ksize-1;
        float tmp = src[y*w + 0];
        for (x1 = 0; x1 < halfk  ; x1++) ringbuf[ksize*ithread + x1] = tmp;
        for (; x1 < ksize-1; x1++) ringbuf[ksize*ithread + x1] = src[y*w + x1-halfk];
        for (x1 = 0; x1 < w; x1++)
        {
            const int ibufi0_fix = (x1+ksize-1)%ksize;

            if(x1 < xmax)
                ringbuf[ksize*ithread + ibufi0_fix] = src[y*w + x1+halfk];
            else
                ringbuf[ksize*ithread + ibufi0_fix] = src[y*w + w-1];
            if (bufi0 == ksize) bufi0 = 0;
            dst[y*w + x1] = convolve(kernel, &ringbuf[ksize*ithread], ksize, bufi0);
        }
    }
    // blur each column
    #pragma omp parallel for if(nthreads>1)// this is the for loop I parallelized but ended up with wrong output and running slower
    for (x = 0; x < w; x++)
    {
        int ithread = omp_get_thread_num();
        int y1;
        int bufi0 = ksize-1;
        float tmp = dst[0*w + x];
        for (y1 = 0; y1 < halfk  ; y1++) ringbuf[ksize*ithread + y1] = tmp;
        for (     ; y1 < ksize-1; y1++) ringbuf[ksize*ithread + y1] = dst[(y1-halfk)*w + x];

        for (y1 = 0; y1 < h; y1++)
        {
            const int ibufi0_fix = (y1+ksize-1)%ksize;
            if(y1 < ymax)
                ringbuf[ksize*ithread + ibufi0_fix] = dst[(y1+halfk)*w + x];
            else
                ringbuf[ksize*ithread + ibufi0_fix] = dst[(h-1)*w + x];

            if (bufi0 == ksize) bufi0 = 0;
            dst[y1*w + x] = convolve(kernel, &ringbuf[ksize*ithread], ksize, bufi0);
        }
    }

    // clean up
    _mm_free(kernel);
    _mm_free(ringbuf);
}

int compare(float *dst1, float *dst2, const int n) {
    int error = 0;
    for(int i=0; i<n; i++) {
        if(dst1[i] != dst2[i]) error++;
    }
    return error;
}


int main() {
    const int w = 20;
    const int h = 20;

    float *src =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst1 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst2 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    for(int i=0; i<w*h; i++) {
        src[i] = i;
    }

    gaussian_blur(src, dst1, w, h, 1.0f, 1);
    gaussian_blur(src, dst2, w, h, 1.0f, 4);
    int error = compare(dst1, dst2, w*h);
    printf("error %d\n", error);
    _mm_free(src);
    _mm_free(dst1);
    _mm_free(dst2);
}

Edit: here is the code where ringbuf is defined inside the parallel block, following Hristo's comment. It should be equivalent.

#include "math.h"
#include "omp.h"
#include "stdio.h"
#include <nmmintrin.h>

float convolve(const float *kernel, const float *ringbuf, const int ksize, const int bufi0) {
    float sum = 0.0f;
    for(int i=0; i<ksize; i++) {
        sum += kernel[i]*ringbuf[i];
    }
    return sum;
}


void gaussian_blur(float *src, float *dst, int w, int h, float sigma, int nthreads)
{
    int x, y, i;
    int ksize = (int)(sigma * 2.f * 4.f + 1) | 1;
    int halfk = ksize / 2;
    printf("ksize %d\n", ksize);
    float scale = -0.5f/(sigma*sigma);
    float sum = 0.f;
    float *kernel;
    int xmax = w - halfk;
    int ymax = h - halfk;

    // if sigma too small, just copy src to dst
    if (ksize <= 1)
    {
        for (y = 0; y < h; y++)
            for (x = 0; x < w; x++)
                dst[y*w + x] = src[y*w + x];
        return;
    }

    // create Gaussian kernel
    //kernel = malloc(ksize * sizeof(float));
    kernel =  (float*)_mm_malloc(ksize * sizeof(float),16);

    #pragma omp parallel for reduction(+ : sum) if(nthreads>1)
    for (i = 0; i < ksize; i++)
    {
        float x = (float)(i - halfk);
        float t = expf(scale * x * x);
        kernel[i] = t;
        sum += t;
    }

    scale = 1.f / sum;
    #pragma omp parallel for if(nthreads>1)
    for (i = 0; i < ksize; i++)
        kernel[i] *= scale;

    // blur each row
    //#pragma omp parallel for if(nthreads>1)// this is the for loop I parallelized but ended up with wrong output and running slower
    #pragma omp parallel if(nthreads>1) 
    {
        float *ringbuf = (float*)_mm_malloc(ksize * sizeof(float),16);
        #pragma omp for// this is the for loop I parallelized but ended up with wrong output and running slower
        for (y = 0; y < h; y++)
        {
            //printf("nthread %d\n", nthread);
            int x1;
            int bufi0 = ksize-1;
            float tmp = src[y*w + 0];
            for (x1 = 0; x1 < halfk  ; x1++) ringbuf[x1] = tmp;
            for (; x1 < ksize-1; x1++) ringbuf[x1] = src[y*w + x1-halfk];
            for (x1 = 0; x1 < w; x1++)
            {
                const int ibufi0_fix = (x1+ksize-1)%ksize;

                if(x1 < xmax)
                    ringbuf[ibufi0_fix] = src[y*w + x1+halfk];
                else
                    ringbuf[ibufi0_fix] = src[y*w + w-1];
                if (bufi0 == ksize) bufi0 = 0;
                dst[y*w + x1] = convolve(kernel, ringbuf, ksize, bufi0);
            }
        }
        _mm_free(ringbuf);
    }

    // blur each column
    #pragma omp parallel if(nthreads>1)
    {
        float *ringbuf = (float*)_mm_malloc(ksize * sizeof(float),16);
        #pragma omp for// this is the for loop I parallelized but ended up with wrong output and running slower
        for (x = 0; x < w; x++)
        {
            int y1;
            int bufi0 = ksize-1;
            float tmp = dst[0*w + x];
            for (y1 = 0; y1 < halfk  ; y1++) ringbuf[y1] = tmp;
            for (     ; y1 < ksize-1; y1++) ringbuf[y1] = dst[(y1-halfk)*w + x];

            for (y1 = 0; y1 < h; y1++)
            {
                const int ibufi0_fix = (y1+ksize-1)%ksize;
                if(y1 < ymax)
                    ringbuf[ibufi0_fix] = dst[(y1+halfk)*w + x];
                else
                    ringbuf[ibufi0_fix] = dst[(h-1)*w + x];

                if (bufi0 == ksize) bufi0 = 0;
                dst[y1*w + x] = convolve(kernel, ringbuf, ksize, bufi0);
            }
        }
        _mm_free(ringbuf);
    }
    // clean up
    _mm_free(kernel);
}

int compare(float *dst1, float *dst2, const int n) {
    int error = 0;
    for(int i=0; i<n; i++) {
        if(dst1[i] != dst2[i]) error++;
    }
    return error;
}


int main() {
    const int w = 20;
    const int h = 20;

    float *src =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst1 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    float *dst2 =  (float*)_mm_malloc(w*h*sizeof(float),16);
    for(int i=0; i<w*h; i++) {
        src[i] = i;
    }

    gaussian_blur(src, dst1, w, h, 1.0f, 1);
    gaussian_blur(src, dst2, w, h, 1.0f, 4);
    int error = compare(dst1, dst2, w*h);
    printf("error %d\n", error);
    _mm_free(src);
    _mm_free(dst1);
    _mm_free(dst2);
}
Answered 2013-05-18T14:11:06.087