AVX 版本:
/// Scalar threshold-to-float conversion.
///
/// For each index i in [0, n), stores 0.0f into f[i] when c[i] is less than
/// c_thresh, and 1.0f otherwise. The comparison is done on plain `char`
/// values, so its signedness follows the platform's `char`.
///
/// @param f         Output array; the first n floats are overwritten.
/// @param c         Input array of n chars.
/// @param n         Number of elements to process (0 leaves f untouched).
/// @param c_thresh  Threshold each input char is compared against.
void floatSelect(float* f, const char* c, size_t n, char c_thresh) {
    float* out = f;
    const char* const stop = c + n;
    for (const char* in = c; in != stop; ++in, ++out) {
        *out = (*in < c_thresh) ? 0.0f : 1.0f;
    }
}
/// SIMD variant of floatSelect: handles 16 chars per iteration using 128-bit
/// integer compares/shuffles and 256-bit float blends, then delegates the
/// remaining n % 16 elements to the scalar floatSelect.
///
/// For each index i in [0, n), f[i] becomes 0.0f when c[i] < c_thresh and
/// 1.0f otherwise. Loads and stores are unaligned, so neither pointer needs
/// any particular alignment.
///
/// NOTE(review): _mm_cmplt_epi8 compares bytes as signed values, while the
/// scalar tail compares plain `char`. The two agree only where `char` is
/// signed -- confirm for the target toolchain.
///
/// @param f         Output array; the first n floats are overwritten.
/// @param c         Input array of n chars.
/// @param n         Number of elements to process.
/// @param c_thresh  Threshold each input char is compared against.
void vecFloatSelect(float* f, const char* c, size_t n, char c_thresh) {
    constexpr size_t kChunk = 16;  // chars consumed per iteration (one __m128i)

    // Shuffle controls, listed lowest byte first. Each one moves four
    // consecutive compare-result bytes into the top byte of a 32-bit lane
    // (the byte holding the sign bit that blendv inspects); -1 zeroes the
    // other three bytes of the lane.
    const __m128i pick0to3 =
        _mm_setr_epi8(-1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3);
    const __m128i pick4to7 =
        _mm_setr_epi8(-1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1, 7);
    const __m128i pick8to11 =
        _mm_setr_epi8(-1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1, 10, -1, -1, -1, 11);
    const __m128i pick12to15 =
        _mm_setr_epi8(-1, -1, -1, 12, -1, -1, -1, 13, -1, -1, -1, 14, -1, -1, -1, 15);

    const __m256 allOnes = _mm256_set1_ps(1.0f);
    const __m256 allZeros = _mm256_setzero_ps();
    const __m128i limit = _mm_set1_epi8(c_thresh);

    const size_t tail = n % kChunk;
    const size_t bulk = n - tail;  // largest multiple of kChunk not exceeding n

    for (size_t pos = 0; pos < bulk; pos += kChunk) {
        const __m128i bytes =
            _mm_loadu_si128(reinterpret_cast<const __m128i*>(c + pos));
        // 0xFF in every byte position where c[pos + k] < c_thresh, else 0x00.
        const __m128i below = _mm_cmplt_epi8(bytes, limit);

        // Widen the 16 byte flags into two vectors of eight 32-bit lane masks.
        // _mm256_set_m128i takes (high half, low half).
        const __m256i selLow = _mm256_set_m128i(_mm_shuffle_epi8(below, pick4to7),
                                                _mm_shuffle_epi8(below, pick0to3));
        const __m256i selHigh = _mm256_set_m128i(_mm_shuffle_epi8(below, pick12to15),
                                                 _mm_shuffle_epi8(below, pick8to11));

        // blendv takes the second operand (0.0f) where the lane's sign bit is
        // set, i.e. where the char was below the threshold; otherwise 1.0f.
        const __m256 outLow =
            _mm256_blendv_ps(allOnes, allZeros, _mm256_castsi256_ps(selLow));
        const __m256 outHigh =
            _mm256_blendv_ps(allOnes, allZeros, _mm256_castsi256_ps(selHigh));

        _mm256_storeu_ps(f + pos, outLow);
        _mm256_storeu_ps(f + pos + 8, outHigh);
    }

    // Fewer than kChunk elements remain; finish them with the scalar routine.
    floatSelect(f + bulk, c + bulk, tail, c_thresh);
}