sse - avx 比 sse 多媒体扩展慢

Question

我正在编写一个完美的程序来与多媒体扩展并行。该程序包括转换图像，因此我遍历一个矩阵并修改其中的每个像素。为了更快地浏览，我使用多媒体扩展：

起初我使用 SSE3 扩展并实现了 2.5 的加速。接下来，我对它进行了编程，扩展了 sse 算法以使用 AVX 扩展（双尺寸向量），但我没有获得相对于 SSE3 的收益。使用 SSE 执行程序的时间或多或少与 AVX 相同。以下是分别针对 SSE 和 AVX 的代码摘要：

        for(i=0; i<lim; i+=12) { //tam vale n*n o n*n-12 dependiendo de si n*n es multiplo de 12. 12 ya que 3componentes*4pixeles(4 tamvector)
            vectorR = _mm_set_ps(matrix[i+9], matrix[i+6], matrix[i+3], matrix[i]);
            vectorG = _mm_set_ps(matrix[i+10], matrix[i+7], matrix[i+4], matrix[i+1]);
            vectorB = _mm_set_ps(matrix[i+11], matrix[i+8], matrix[i+5], matrix[i+2]);

            calcular_coeficientes_sse3(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0, vectorR, vectorG, vectorB,
                                    values0, values255, values128, &vectorY, &vectorU, &vectorV);

            _mm_store_ps(&y_aux[0], vectorY);
            _mm_store_ps(&u_aux[0], vectorU);
            _mm_store_ps(&v_aux[0], vectorV);

            //Colocamos los datos en la matriz
            //PIXEL 1
            matrix[i] = y_aux[0];
            matrix[i+1]=u_aux[0];
            matrix[i+2]=v_aux[0];
            //PIXEL 2
            matrix[i+3]=y_aux[1];
            matrix[i+4]=u_aux[1];
            matrix[i+5]=v_aux[1];
            //PIXEL 3
            matrix[i+6] = y_aux[2];;
            matrix[i+7]=u_aux[2];
            matrix[i+8]=v_aux[2];
            //PIXEL 4
            matrix[i+9]=y_aux[3];
            matrix[i+10]=u_aux[3];
            matrix[i+11]=v_aux[3];
    }

 for(i=0; i<lim; i+=24) { //Vamos de 8 en 8 pixeles
            vectorR = _mm256_set_ps(matrix[i+21], matrix[i+18], matrix[i+15] ,matrix[i+12],matrix[i+9], matrix[i+6], matrix[i+3], matrix[i]);
            vectorG = _mm256_set_ps(matrix[i+22], matrix[i+19], matrix[i+16], matrix[i+13], matrix[i+10], matrix[i+7], matrix[i+4], matrix[i+1]);
            vectorB = _mm256_set_ps(matrix[i+23], matrix[i+20], matrix[i+17], matrix[i+14], matrix[i+11], matrix[i+8], matrix[i+5], matrix[i+2]);

            calcular_coeficientes_avx(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0, vectorR, vectorG, vectorB,
                                    values0, values255, values128, &vectorY, &vectorU, &vectorV);

            _mm256_store_ps(&y_aux[0], vectorY);
            _mm256_store_ps(&u_aux[0], vectorU);
            _mm256_store_ps(&v_aux[0], vectorV);

            //Colocamos los datos en la matriz
            //PIXEL 1
            matrix[i] = y_aux[0];
            matrix[i+1]=u_aux[0];
            matrix[i+2]=v_aux[0];
            //PIXEL 2
            matrix[i+3]=y_aux[1];
            matrix[i+4]=u_aux[1];
            matrix[i+5]=v_aux[1];
            //PIXEL 3
            matrix[i+6] = y_aux[2];;
            matrix[i+7]=u_aux[2];
            matrix[i+8]=v_aux[2];
            //PIXEL 4
            matrix[i+9]=y_aux[3];
            matrix[i+10]=u_aux[3];
            matrix[i+11]=v_aux[3];
            //PIXEL 5
            matrix[i+12]=y_aux[4];
            matrix[i+13]=u_aux[4];
            matrix[i+14]=v_aux[4];
            //PIXEL 6
            matrix[i+15]=y_aux[5];
            matrix[i+16]=u_aux[5];
            matrix[i+17]=v_aux[5];
            //PIXEL 7
            matrix[i+18]=y_aux[6];
            matrix[i+19]=u_aux[6];
            matrix[i+20]=v_aux[6];
            //PIXEL 8
            matrix[i+21]=y_aux[7];
            matrix[i+22]=u_aux[7];
            matrix[i+23]=v_aux[7];
    }

void calcular_coeficientes_sse3(__m128 Ycoef0, __m128 Ycoef1, __m128 Ycoef2, __m128 Ucoef0, __m128 Vcoef0, __m128 vectorR,
                            __m128 vectorG, __m128 vectorB, __m128 values0, __m128 values255, __m128 values128,
                                     __m128 *vectorY, __m128 *vectorU, __m128 *vectorV) {

    //CALCULO DE Y3, Y2, Y1, Y0 (Cuatro píxeles consecutivos)
                                                    //PRIMERA VUELta
    __m128 valores1 = _mm_mul_ps(Ycoef0, vectorR);  // valores1 = (0.299*R[3], 0.299*R[2], 0.299*R[1], 0.299*R[0])
    __m128 valores2 = _mm_mul_ps(Ycoef1, vectorG);  // valores2 = (0.587*G[3], 0.587*G[2], 0.587*G[1], 0.587*G[0])
    __m128 valores3 = _mm_mul_ps(Ycoef2, vectorB);  // valores3 = (0.114*B[3], 0.114*B[2], 0.114*B[1], 0.114*B[0]);
    valores1 = _mm_add_ps(valores1, valores2);      // valores1 = (0.299*R[3] + 0.587*G[3], 0.299*R[2] + 0.587*G[2], 0.299*G[1]+ ..., ...)
    *vectorY = _mm_add_ps(valores1, valores3);      // vectorY = (Y[3], Y[2], Y[1], Y[0])
    *vectorY = _mm_floor_ps(*vectorY);
    //Calculo de U3, U2, U1, U0
    //B-Y
    valores1 = _mm_sub_ps(vectorB, *vectorY);      // valores1 = (B[3]-Y[3], B[2]-Y[2], B[1]-Y[1], B[0]-Y[0])
    valores1 = _mm_mul_ps(Ucoef0, valores1);       // valores1 = (0.492*(B[3]-Y[3]), 0.492*(B[2]-Y[2]), 0.492*(B[1]-Y[1]), 0.492*(...)) 
    *vectorU = _mm_add_ps(valores1, values128);    // vectorU  = (U[3], U[2], U[1], U[0])
    //CALCULO DE V3, V2, V1, V0
    // R-Y
    valores1 = _mm_sub_ps(vectorR, *vectorY);      // valores1 = (R[3]-Y[3], R[2]-Y[2], R[1]-Y[1], R[0]-Y[0])
    valores1 = _mm_mul_ps(Vcoef0, valores1);       // valores1 = (0.877*(R[3]-Y[3]), 0.877*(R[2]-Y[2]), 0.877*(R[1]-Y[1]), 0.877*(...))
    valores1 = _mm_add_ps(valores1, values128);    // valores1 = (0.877*(R[3]-Y[3]) + 128, 0.877*(R[2]-Y[2]) + 128, ..., ...)
    //valores1 pueden necesitar saturacion. 

    //SATURACIONES a 0
    //Para evitar hacer comparaciones cogemos el mayor entre 0 y el valor V[i]:
                                                            // Si V[i] > 0 se queda con V[i] pues es mayor que 0.
                                                            // Si V[i] < 0 se queda con 0 pues es mayor que un número negativo.
    valores1 = _mm_max_ps(valores1, values0);      // valores1 = (max(0.877*(R[3]-Y[3]) + 128,0), ..., ..., ...)

    // SATURACIONES a 255
    //Para evitar hacer comparacion cogemos el menor entre 255 y el valor V[i]
                                                            // Si V[i] < 255 entonces se queda con el menor, V[i]
                                                            // Si V[i] > 255 entonces se queda con el menor, 255.
    *vectorV = _mm_min_ps(valores1, values255);  //vectorV = (V[3], V[2], V[1], V[0])

    //NOTA: Al estar las operaciones implementadas en hardware se hacen las operaciones max y min en un 1 ciclo. 
    //por lo que solo en dos ciclos comprobamos la saturacion de 4 valores V.

    return; //El procedimiento termina devolviendo vectorY, vectorU y vectorV

}

void calcular_coeficientes_avx(__m256 Ycoef0, __m256 Ycoef1, __m256 Ycoef2, __m256 Ucoef0, __m256 Vcoef0, __m256 vectorR,
                            __m256 vectorG, __m256 vectorB, __m256 values0, __m256 values255, __m256 values128,
                                     __m256 *vectorY, __m256 *vectorU, __m256 *vectorV) {

    //CALCULO DE Y7, Y6, Y5, Y4, Y3, Y2, Y1, Y0 (Cuatro píxeles consecutivos)

    __m256 valores1 = _mm256_mul_ps(Ycoef0, vectorR);
    __m256 valores2 = _mm256_mul_ps(Ycoef1, vectorG);
    __m256 valores3 = _mm256_mul_ps(Ycoef2, vectorB);
    valores1 = _mm256_add_ps(valores1, valores2);
    *vectorY = _mm256_add_ps(valores1, valores3);
    *vectorY = _mm256_floor_ps(*vectorY);
    //Calculo de U7, U6, U5, U4, U3, U2, U1, U0
    valores1 = _mm256_sub_ps(vectorB, *vectorY);
    valores1 = _mm256_mul_ps(Ucoef0, valores1);
    *vectorU = _mm256_add_ps(valores1, values128);
    //CALCULO DE V7, V6, V5, V4, V3, V2, V1, V0
    // R-Y
    valores1 = _mm256_sub_ps(vectorR, *vectorY);
    valores1 = _mm256_mul_ps(Vcoef0, valores1);
    valores1 = _mm256_add_ps(valores1, values128);
    //valores1 pueden necesitar saturacion. 

    //SATURACIONES a 0
    //Para evitar hacer comparaciones cogemos el mayor entre 0 y el valor V[i]:


    valores1 = _mm256_max_ps(valores1, values0);

    // SATURACIONES a 255
    //Para evitar hacer comparacion cogemos el menor entre 255 y el valor V[i]
                                                            // Si V[i] < 255 entonces se queda con el menor, V[i]
                                                            // Si V[i] > 255 entonces se queda con el menor, 255.
    *vectorV = _mm256_min_ps(valores1, values255);  //vectorV = (V[3], V[2], V[1], V[0])

    //NOTA: Al estar las operaciones implementadas en hardware se hacen las operaciones max y min en un 1 ciclo. 
    //por lo que solo en dos ciclos comprobamos la saturacion de 4 valores V.

    return; //El procedimiento termina devolviendo vectorY, vectorU y vectorV

}

如您所见， sse 和 avx 相同，但最后一个已扩展并使用更长的向量。为什么执行时间相同？

注意：我已经在两台支持 AVX 的不同计算机上进行了尝试（显然），我也遇到了同样的问题。

非常感谢。

score 2 · Accepted Answer

对 RGB 到 YUV 使用浮点算法是大材小用，所以我将在这里使用定点算法。输出格式很烦人，可能有一种比我使用的更聪明的方法可以将这些东西打乱到位，但无论你做什么，当你想对 YUV 数据做任何事情时，它也会再次烦人。使用诸如 [YYYYYYYY UUUUUUUU VVVVVVVV] 之类的格式会更有意义。

虽然已经有许多 RGB 到 YUV 转换器可用，但我还是试了一次。有更多可用的替代方案不会有什么坏处，而且它们中的大多数都不完全匹配 - 例如进行色度子采样并通常使用其他输出格式。

我没有测试它（除了检查它是否编译），所以任何关于正确性或性能的评论都将不胜感激。

void toYUV(uint8_t* pixels, int size)
{
  // r g b r  g b r g  b r g b  r g b r | g b r g  b r g b

  short rw = 9798; // (short)round(0.299 * 2^15)
  short gw = 19234;// (short)round(0.587 * 2^15)
  short bw = 3736; // (short)round(0.114 * 2^15)
  char _ = -1;

  __m128i z = _mm_setzero_si128();

  for (int i = 0; i < size; i += 24)
  {
    __m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i));
    __m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16));

    // widen
    // w0 = r0 g0 b0 r1  g1 b1 r2 g2
    __m128i w0 = _mm_unpacklo_epi8(p0, z);
    // w1 = b2 r3 g3 b3  r4 g4 b4 r5
    __m128i w1 = _mm_unpackhi_epi8(p0, z);
    // w2 = g5 b5 r6 g6  b6 r7 g7 b7
    __m128i w2 = _mm_unpacklo_epi8(p1, z);

    __m128i zRGBRGBz = _mm_setr_epi16(0, rw, gw, bw, rw, gw, bw, 0);
    // calculate Y
    // 0+r0 g0+b0  r1+g1 b1+0
    __m128i s0 = _mm_madd_epi16(_mm_bslli_si128(w0, 2), zRGBRGBz);
    // 0+r2 g2+b2  r3+g3 b3+0
    __m128i s1 = _mm_madd_epi16(_mm_alignr_epi8(w1, w0, 10), zRGBRGBz);
    // 0+r4 g4+b4  r5+g5 b5+0
    __m128i s2 = _mm_madd_epi16(_mm_alignr_epi8(w2, w1, 6), zRGBRGBz);
    // 0+r6 g6+b6  r7+g7 b7+0
    __m128i s3 = _mm_madd_epi16(_mm_bsrli_si128(w2, 2), zRGBRGBz);
    // the math works out to make the Y's in [0 .. 254] after scaling back, so
    // y0 = y0 0 y1 0 y2 0 y3 0
    // y1 = y4 0 y5 0 y6 0 y7 0
    __m128i y0 = _mm_srli_epi32(_mm_hadd_epi32(s0, s1), 15);
    __m128i y1 = _mm_srli_epi32(_mm_hadd_epi32(s2, s3), 15);
    __m128i y = _mm_packus_epi32(y0, y1);
    y = _mm_packus_epi16(y, y);

    // calculate U
    // todo: do smarter shuffles?
    // b0 = 0 b0 0 b1 0 b2 0 b3
    __m128i b0 = _mm_or_si128(
      _mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 4, 5, _, _, 10, 11, _, _, _, _, _, _, _, _)),
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _,  _,  _, _, _, 0, 1, _, _, 6, 7)));
    // b1 = 0 b4 0 b5 0 b6 0 b7
    __m128i b1 = _mm_or_si128(
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 13, 14, _, _, _, _, _, _, _, _, _, _,  _, _)),
      _mm_shuffle_epi8(w2, _mm_setr_epi8(_, _,  _,  _, _, _, 2, 3, _, _, 8, 9, _, _, 14, 15)));
    // b - y in [-225 .. 226]
    // u = 18492 * (b - y) >> 15 -> (18492 * b + -18492 * y) >> 15
    // .. so u in [-128 .. 127]
    short us = 18492;
    __m128i u0 = _mm_madd_epi16(_mm_or_si128(b0, y0), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us));
    u0 = _mm_srai_epi32(u0, 15);
    __m128i u1 = _mm_madd_epi16(_mm_or_si128(b1, y1), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us));
    u1 = _mm_srai_epi32(u1, 15);
    // pack to sbytes
    __m128i u = _mm_packs_epi32(u0, u1);
    u = _mm_packs_epi16(u, u);

    // calculate V
    // todo: do smarter shuffles?
    // r0 = 0 r0 0 r1 0 r2 0 r3
    __m128i r0 = _mm_or_si128(
      _mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 0, 1, _, _, 6, 7, _, _, 12, 13, _, _, _, _)),
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _,  _,  _, _, _, 2, 3)));
    // r1 = 0 r4 0 r5 0 r6 0 r7
    __m128i r1 = _mm_or_si128(
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 8, 9, _, _, 14, 15, _, _, _, _, _, _,  _, _)),
      _mm_shuffle_epi8(w2, _mm_setr_epi8(_, _, _, _, _, _,  _,  _, _, _, 4, 5, _, _, 10, 11)));
    // r - y in [-178 .. 179]
    // v = 23372 * (r - y) >> 15 -> (23372 * r + -23372 * y) >> 15
    // .. so v in [-128 .. 127]
    short vs = 23372;
    __m128i v0 = _mm_madd_epi16(_mm_or_si128(r0, y0), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs));
    v0 = _mm_srai_epi32(v0, 15);
    __m128i v1 = _mm_madd_epi16(_mm_or_si128(r1, y1), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs));
    v1 = _mm_srai_epi32(v1, 15);
    // pack to sbytes
    __m128i v = _mm_packs_epi32(v0, v1);
    v = _mm_packs_epi16(v, v);

    // interleave :(
    // y0 u0 v0 y1  u1 v1 y2 u2  v2 y3 u3 v3  y4 u4 v4 y5
    // u5 v5 y6 u6  v6 y7 u7 v7
    __m128i shuf0 = _mm_setr_epi8(0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _, 5);
    p0 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, shuf0),
      _mm_shuffle_epi8(u, _mm_bslli_si128(shuf0, 1))),
      _mm_shuffle_epi8(v, _mm_bslli_si128(shuf0, 2)));
    __m128i shuf1 = _mm_setr_epi8(_, 5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _);
    p1 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, _mm_srli_si128(shuf1, 2)),
      _mm_shuffle_epi8(u, _mm_srli_si128(shuf1, 1))),
      _mm_shuffle_epi8(v, shuf1));

    _mm_storeu_si128((__m128i*)(pixels + i), p0);
    _mm_storel_epi64((__m128i*)(pixels + i + 16), p1);
  }
}

这不是很好，所有这些洗牌和（非）包装都太重了p5。在常春藤上，这不是一个问题。使用较少交错的输出格式将有很大帮助，节省 6 次随机播放。当然，由于这是定点，因此您无法将其真正转换为 AVX，但您可以尝试 AVX2。

这是一个“更垂直”的版本，基于pmulhrsw而不是pmaddwd，也未测试（它可能已损坏）。asm 看起来更好，但我也没有测试过性能。

void toYUV(uint8_t* pixels, int size)
{
  // r g b r  g b r g  b r g b  r g b r | g b r g  b r g b
  char _ = -1;
  __m128i rshuf0 = _mm_setr_epi8(0, _, 3, _, 6, _,  9, _, 12, _, 15, _, _, _, _, _);
  __m128i rshuf1 = _mm_setr_epi8(_, _, _, _, _, _,  _, _,  _, _,  _, _, 2, _, 5, _);
  __m128i gshuf0 = _mm_setr_epi8(1, _, 4, _, 7, _, 10, _, 13, _,  _, _, _, _, _, _);
  __m128i gshuf1 = _mm_setr_epi8(_, _, _, _, _, _,  _, _,  _, _,  0, _, 3, _, 6, _);
  __m128i bshuf0 = _mm_setr_epi8(2, _, 5, _, 8, _, 11, _, 14, _,  _, _, _, _, _, _);
  __m128i bshuf1 = _mm_setr_epi8(_, _, _, _, _, _,  _, _,  _, _,  1, _, 2, _, 5, _);

  __m128i ryweight = _mm_set1_epi16(9798); // (short)round(0.299 * 2^15)
  __m128i gyweight = _mm_set1_epi16(19235);// (short)round(0.587 * 2^15)
  __m128i byweight = _mm_set1_epi16(3736); // (short)round(0.114 * 2^15)
  __m128i uscale  = _mm_set1_epi16(18492); // (short)floor(2^15 / (2^15 - byweight) * 0.5 * 2^15)
  __m128i uofs = _mm_set1_epi16(128);
  __m128i vscale  = _mm_set1_epi16(23372); // (short)floor(2^15 / (2^15 - ryweight) * 0.5 * 2^15)
  __m128i vofs = _mm_set1_epi16(128);

  __m128i yshuf0 = _mm_setr_epi8(0, _, _, 2, _, _, 4, _, _, 6, _, _, 8, _, _, _);
  __m128i yshuf1 = _mm_setr_epi8(10, _, _, 12, _, _, 14, _, _, _, _, _, _, _, _, _);
  __m128i ushuf0 = _mm_setr_epi8(_, 0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _);
  __m128i ushuf1 = _mm_setr_epi8(5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _, _);

  for (int i = 0; i < size; i += 24)
  {
    __m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i));
    __m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16));

    __m128i r = _mm_or_si128(_mm_shuffle_epi8(p0, rshuf0), _mm_shuffle_epi8(p1, rshuf1));
    __m128i g = _mm_or_si128(_mm_shuffle_epi8(p0, gshuf0), _mm_shuffle_epi8(p1, gshuf1));
    __m128i b = _mm_or_si128(_mm_shuffle_epi8(p0, bshuf0), _mm_shuffle_epi8(p1, bshuf1));

    __m128i scaledr = _mm_mulhrs_epi16(r, ryweight);
    __m128i scaledg = _mm_mulhrs_epi16(g, gyweight);
    __m128i scaledb = _mm_mulhrs_epi16(b, byweight);
    __m128i y = _mm_add_epi16(_mm_add_epi16(scaledr, scaledg), scaledb);
    __m128i u = _mm_mulhrs_epi16(_mm_sub_epi16(b, y), uscale);
    __m128i v = _mm_mulhrs_epi16(_mm_sub_epi16(r, y), vscale);
    // pack back to bytes and shuffle
    // words in y are guaranteed to be in [0 - 255] so just shuffle them into place
    // swords in u and v may be slightly wonky so pack with saturation first
    u = _mm_packs_epi16(u, u);
    v = _mm_packs_epi16(v, v);
    p0 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, yshuf0),
      _mm_shuffle_epi8(u, ushuf0)),
      _mm_shuffle_epi8(v, _mm_bslli_si128(ushuf0, 1)));
    p1 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, yshuf1),
      _mm_shuffle_epi8(u, ushuf1)),
      _mm_shuffle_epi8(v, _mm_bslli_si128(ushuf1, 1)));

    _mm_storeu_si128((__m128i*)(pixels + i), p0);
    _mm_storel_epi64((__m128i*)(pixels + i + 16), p1);
  }
}

sse - avx 比 sse 多媒体扩展慢

1 回答 1

Related

Reference