Skip to content

clang x86 missed optimization for 2 dimension array accessed through always zero enum class #61615

Closed
@AMS21

Description

@AMS21

Given the following code (Godbolt):

// Scoped enumeration with a single enumerator whose value is 0.
// No underlying type is specified, so it defaults to int.
enum class E {
    A = 0,
};

// 1x1 constant table; its only element is t[0][0] == 1.
constexpr int t[1][1]{{1}};

// Indexes t with the same enum value, cast to int, in both dimensions.
int f1(E a) {
    return t[static_cast<int>(a)][static_cast<int>(a)];
}

// Indexes t with two independent enum values, each cast to int.
int f2(E a, E b) {
    return t[static_cast<int>(a)][static_cast<int>(b)];
}

clang-trunk generates this assembly:

# Clang output for f1: computes &t + 4*a + 4*a and loads the 32-bit value there.
f1(E):                                # @f1(E)
        # rax = the 32-bit value in edi, sign-extended to 64 bits
        movsxd  rax, edi
        # rcx = address of t (RIP-relative)
        lea     rcx, [rip + t]
        # rcx = &t + 4*rax (first-dimension offset)
        lea     rcx, [rcx + 4*rax]
        # eax = 32-bit load from rcx + 4*rax (second-dimension offset)
        mov     eax, dword ptr [rcx + 4*rax]
        ret
# Clang output for f2: same address computation, with a separate index per dimension.
f2(E, E):                              # @f2(E, E)
        # rax, rcx = the 32-bit values in edi and esi, sign-extended to 64 bits
        movsxd  rax, edi
        movsxd  rcx, esi
        # rdx = address of t (RIP-relative)
        lea     rdx, [rip + t]
        # rax = &t + 4*rax (first-dimension offset)
        lea     rax, [rdx + 4*rax]
        # eax = 32-bit load from rax + 4*rcx (second-dimension offset)
        mov     eax, dword ptr [rax + 4*rcx]
        ret
# Storage for the table: a single 32-bit value, 1.
t:
        .long   1                               # 0x1

and this IR:

; The table t: an internal constant [1 x [1 x i32]] whose single element is 1.
@_ZL1t = internal unnamed_addr constant [1 x [1 x i32]] [[1 x i32] [i32 1]], align 4, !dbg !0

; f1: sign-extends the i32 argument to i64, uses it as both array indices of an
; inbounds getelementptr into @_ZL1t, then loads and returns the i32 at that address.
define dso_local noundef i32 @_Z2f11E(i32 noundef %0) local_unnamed_addr #0 !dbg !23 {
  call void @llvm.dbg.value(metadata i32 %0, metadata !27, metadata !DIExpression()), !dbg !28
  %2 = sext i32 %0 to i64, !dbg !29
  %3 = getelementptr inbounds [1 x [1 x i32]], ptr @_ZL1t, i64 0, i64 %2, i64 %2, !dbg !29
  %4 = load i32, ptr %3, align 4, !dbg !29, !tbaa !30
  ret i32 %4, !dbg !34
}

; f2: same as f1, but each array index comes from its own sign-extended argument.
define dso_local noundef i32 @_Z2f21ES_(i32 noundef %0, i32 noundef %1) local_unnamed_addr #0 !dbg !35 {
  call void @llvm.dbg.value(metadata i32 %0, metadata !39, metadata !DIExpression()), !dbg !41
  call void @llvm.dbg.value(metadata i32 %1, metadata !40, metadata !DIExpression()), !dbg !41
  %3 = sext i32 %0 to i64, !dbg !42
  %4 = sext i32 %1 to i64, !dbg !42
  %5 = getelementptr inbounds [1 x [1 x i32]], ptr @_ZL1t, i64 0, i64 %3, i64 %4, !dbg !42
  %6 = load i32, ptr %5, align 4, !dbg !42, !tbaa !30
  ret i32 %6, !dbg !43
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

While gcc-trunk generates just:

# GCC output for f1: returns the constant 1 in eax; no load from t is performed.
f1(E):
        mov     eax, 1
        ret
# GCC output for f2: likewise returns the constant 1 without reading t.
f2(E, E):
        mov     eax, 1
        ret

Interestingly, when using just a plain enum without the static_casts, Clang generates the optimal code (Godbolt). But with a plain enum and the static_casts kept, it still doesn't (Godbolt).

Using __builtin_unreachable() to explicitly tell the compiler that a and b are E::A works for f1() but, strangely, not for f2() (Godbolt).
But using __builtin_assume(...) works for both (Godbolt).
Declaring the underlying type of enum class E as unsigned short or unsigned char also fixes the problem (Godbolt); the same goes for casting to unsigned short or unsigned char instead of int (Godbolt).

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions