我正在编写高度“可向量化”的代码,并注意到:对于 C++ 的 __restrict 关键字(编译器扩展),即使在很简单的情况下,Clang 的行为也与 GCC 不同,而且不太实用。

就编译器生成的代码而言,性能下降约 15 倍(这是我的实际代码中的情况,而非下面的示例)。


// Thin wrapper around a plain (non-restrict) int pointer; used as the
// baseline parameter type for the by-value foo overload.
struct Param {
    int *x;  // may alias any other int pointer
};

// Baseline without restrict: stores 5 through a and 6 through b, then
// returns the sum of the two values read back through the same pointers.
// a and b are allowed to alias, so *a must be reloaded after the store to *b
// (when both point at the same int the result is 12, otherwise 11).
int foo(int *a, int *b) {
    *a = 5;
    *b = 6;
    // No significant optimization here, as expected (for clang/gcc)
    return *a + *b;
}

// Baseline overload taking the pointers wrapped in Param (passed by value):
// stores 5 through a.x and 6 through b.x, then returns the sum of the two
// values read back. The wrapped pointers are not restrict-qualified, so they
// may alias.
int foo(Param a, Param b) {
    *a.x = 5;
    *b.x = 6;
    // No significant optimization here, as expected (for clang/gcc)
    return *a.x + *b.x;
}


// Wrapper around a __restrict-qualified int pointer; used to check whether
// the compiler honors restrict on a struct member.
struct ParamR {
    // "Restricted pointers assert that members point to disjoint storage"
    // (https://en.cppreference.com/w/c/language/restrict). Open question:
    // does C's interpretation of restrict also apply in C++, including to
    // the __restrict extension?
    int *__restrict x;
};

// restrict-qualified variant: stores 5 through a and 6 through b, then
// returns the sum of the two values read back. Callers must pass pointers to
// distinct objects; the __restrict qualifiers promise the compiler that a and
// b do not alias.
int rfoo(int *__restrict a, int *__restrict b) {
    *a = 5;
    *b = 6;
    // Significant optimization here, as expected (for clang/gcc)
    return *a + *b;
}

// Overload taking the restrict-qualified pointers wrapped in ParamR (passed
// by value): stores 5 through a.x and 6 through b.x, then returns the sum of
// the two values read back. Callers must pass wrappers whose x members point
// to distinct objects.
int rfoo(ParamR a, ParamR b) {
    *a.x = 5;
    *b.x = 6;
    // No significant optimization here, NOT expected (clang fails?, gcc optimizes)
    return *a.x + *b.x;
}

// Overload taking restrict-qualified pointers to ParamR: stores 5 through
// a->x and 6 through b->x, then returns the sum of the two values read back.
// Both the outer pointers and the wrapped x members are __restrict-qualified,
// so callers must pass distinct wrappers pointing to distinct ints.
int rfoo(ParamR *__restrict a, ParamR *__restrict b) {
    *a->x = 5;
    *b->x = 6;
    // No significant optimization here, NOT expected (clang fails?, gcc optimizes)
    return *a->x + *b->x;
}

C++(使用 __restrict)和 C 代码(使用标准的 restrict)都会出现这种情况。

我怎样才能让 Clang 明白指针总是指向不相交的存储?


1 个回答




// restrict-qualified helper that the compiler does optimize: stores 5
// through a and 6 through b, then returns the sum of the two values read
// back. Callers must pass pointers to distinct objects; the __restrict
// qualifiers promise the compiler that a and b do not alias.
int rfoo(int *__restrict a, int *__restrict b) {
    *a = 5;
    *b = 6;
    // Significant optimization here, as expected (for clang/gcc)
    return *a + *b;
}

// change this:
// Original form that accesses the restrict-qualified members directly:
// stores 5 through a.x and 6 through b.x, then returns the sum of the two
// values read back. Callers must pass wrappers whose x members point to
// distinct objects.
int rfoo(ParamR a, ParamR b) {
    *a.x = 5;
    *b.x = 6;
    // No significant optimization here, NOT expected (clang fails?, gcc optimizes)
    return *a.x + *b.x;
}

// to this:
// Workaround form: unwraps the members and forwards them to the
// rfoo(int *__restrict, int *__restrict) overload, so the restrict
// qualification sits on function parameters rather than on struct members.
// Returns whatever that overload returns.
int rfoo2(ParamR a, ParamR b) {
    return rfoo(a.x, b.x);
}

来自 clang 12.0.0 的输出:

# Direct member access: the value behind the first pointer is reloaded after
# the second store, so the result is not folded to a constant.
rfoo(ParamR, ParamR):                       # @rfoo(ParamR, ParamR)
        mov     dword ptr [rdi], 5          # store 5 through the first pointer (*a.x = 5)
        mov     dword ptr [rsi], 6          # store 6 through the second pointer (*b.x = 6)
        mov     eax, dword ptr [rdi]        # reload *a.x from memory
        add     eax, 6                      # add the constant 6 stored to *b.x
# NOTE(review): no `ret` is shown here; presumably trimmed from the listing - confirm.
# Forwarding to the restrict-parameter overload: the sum is folded to the
# constant 11 and nothing is reloaded.
rfoo2(ParamR, ParamR):                      # @rfoo2(ParamR, ParamR)
        mov     dword ptr [rdi], 5          # store 5 through the first pointer
        mov     dword ptr [rsi], 6          # store 6 through the second pointer
        mov     eax, 11                     # return value is the constant 5 + 6

这样做确实不方便,尤其是对于更复杂的代码;但如果性能差异如此之大且至关重要,而您又无法改用 GCC,那么可以考虑采用这种方式。

于 2021-12-13T19:33:21.093 回答