-
Notifications
You must be signed in to change notification settings - Fork 14.5k
Open
Description
When N = 4 (where N is the number of array elements that are compared and packed into the mask):
/// Builds a bitmask in which bit `i` is set when `arr[i] == 42`.
///
/// This is the N = 4 variant of the reproducer: only `arr[0]..=arr[3]` are
/// examined, so the returned value is always in `0..16` even though the
/// parameter type is an 8-element array.
// NOTE(review): `inline(never)` is presumably here so the function is emitted
// as a standalone symbol whose generated code can be inspected in isolation.
#[inline(never)]
pub fn f(arr: &[u32; 8]) -> u64{
let mut mask = 0;
// Each comparison yields a bool, which is widened to 0/1 as u64 and shifted
// into the bit position matching the element index.
mask |= ((arr[0] == 42) as u64) << 0;
mask |= ((arr[1] == 42) as u64) << 1;
mask |= ((arr[2] == 42) as u64) << 2;
mask |= ((arr[3] == 42) as u64) << 3;
mask
}
Produces:
# Constant pool entry: the 32-bit value 42 that every element is compared against.
.LCPI0_0:
.long 42
example::f::h52e1c98f18490021:
# Broadcast the constant 42 into every 32-bit lane of xmm0.
vpbroadcastd xmm0, dword ptr [rip + .LCPI0_0]
# Lane-wise equality against the 16 bytes at [rdi]; a matching lane becomes all-ones.
vpcmpeqd xmm0, xmm0, xmmword ptr [rdi]
# Gather the top bit of each of the four 32-bit lanes into the low four bits of eax.
vmovmskps eax, xmm0
ret
But with any other number of elements, you get something along the lines of the following (N=8 here):
# Constant pool entry: the 32-bit value 42 that every element is compared against.
.LCPI0_0:
.long 42
# Per-lane bit values (2, 4, 8, 16) used for the four elements that are compared as a vector.
.LCPI0_1:
.quad 2
.quad 4
.quad 8
.quad 16
example::f::h52e1c98f18490021:
# Scalar compare of the dword at [rdi + 0]: ecx = 1 if it equals 42, else 0 (mask bit 0).
xor ecx, ecx
cmp dword ptr [rdi], 42
sete cl
# Vector compare of the four dwords at [rdi + 4]; matching lanes become all-ones.
vpbroadcastd xmm0, dword ptr [rip + .LCPI0_0]
vpcmpeqd xmm0, xmm0, xmmword ptr [rdi + 4]
# Zero-extend each 32-bit lane to 64 bits, then keep only that lane's bit value (2/4/8/16).
vpmovzxdq ymm0, xmm0
vpand ymm0, ymm0, ymmword ptr [rip + .LCPI0_1]
# Scalar compare of the dword at [rdi + 20], shifted into bit 5.
xor edx, edx
cmp dword ptr [rdi + 20], 42
sete dl
shl edx, 5
# Scalar compare of the dword at [rdi + 24], shifted into bit 6.
xor esi, esi
cmp dword ptr [rdi + 24], 42
sete sil
shl esi, 6
# Scalar compare of the dword at [rdi + 28], shifted into bit 7.
xor eax, eax
cmp dword ptr [rdi + 28], 42
sete al
shl eax, 7
# Merge bits 6 and 7.
or eax, esi
# Horizontal OR of the four 64-bit lanes of ymm0: fold the upper 128 bits into the
# lower, then fold the upper 64 bits into the lower, and move the result to rsi.
vextracti128 xmm1, ymm0, 1
vpor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238
vpor xmm0, xmm0, xmm1
vmovq rsi, xmm0
# Combine all partial results into the final mask in rax.
or rax, rdx
or rax, rcx
or rax, rsi
vzeroupper
ret
Changing types, alignment, or flags does not seem to help. At first I thought it had to do with the shift by 0, but since it works just fine with N=4, something else is probably going on.
LLVM IR for N=4:
; N = 4: a single <4 x i32> load and a single vector compare cover all four
; elements. Each i1 lane is then extracted and turned into its mask bit with a
; scalar zext/select, and the bits are combined with disjoint ORs.
define noundef range(i64 0, 16) i64 @_ZN7example1f17h52e1c98f18490021E(ptr noalias nocapture noundef readonly align 4 dereferenceable(32) %arr) unnamed_addr #0 !dbg !7 {
; Load four elements at once and compare every lane against 42.
%0 = load <4 x i32>, ptr %arr, align 4, !dbg !12
%1 = icmp eq <4 x i32> %0, <i32 42, i32 42, i32 42, i32 42>, !dbg !14
; Lane 0 -> bit 0 (value 1).
%2 = extractelement <4 x i1> %1, i64 0, !dbg !15
%_3 = zext i1 %2 to i64, !dbg !15
; Lane 1 -> bit 1 (value 2).
%3 = extractelement <4 x i1> %1, i64 1, !dbg !16
%_6 = select i1 %3, i64 2, i64 0, !dbg !16
%4 = or disjoint i64 %_6, %_3, !dbg !17
; Lane 2 -> bit 2 (value 4).
%5 = extractelement <4 x i1> %1, i64 2, !dbg !18
%_10 = select i1 %5, i64 4, i64 0, !dbg !18
%6 = or disjoint i64 %4, %_10, !dbg !19
; Lane 3 -> bit 3 (value 8).
%7 = extractelement <4 x i1> %1, i64 3, !dbg !20
%_14 = select i1 %7, i64 8, i64 0, !dbg !20
%8 = or disjoint i64 %6, %_14, !dbg !21
ret i64 %8, !dbg !22
}
LLVM IR for N=8:
; N = 8: the eight elements are not handled as one vector. They are split into
; a scalar (element 0), a <4 x i32> group (byte offset 4), a scalar (byte
; offset 20) and a <2 x i32> group (byte offset 24). Each group selects its bit
; values and the partial results are OR-reduced into the final mask.
define noundef range(i64 0, 256) i64 @example::f::h52e1c98f18490021(ptr noalias nocapture noundef readonly align 4 dereferenceable(32) %arr) unnamed_addr #0 !dbg !7 {
start:
; Element at byte offset 0: scalar load and compare, contributes value 1.
%_5 = load i32, ptr %arr, align 4, !dbg !12
%_4 = icmp eq i32 %_5, 42, !dbg !14
%_3 = zext i1 %_4 to i64, !dbg !15
; Four elements starting at byte offset 4: vector compare, lanes select 2/4/8/16.
%0 = getelementptr inbounds i8, ptr %arr, i64 4, !dbg !16
%1 = load <4 x i32>, ptr %0, align 4, !dbg !16
%2 = icmp eq <4 x i32> %1, <i32 42, i32 42, i32 42, i32 42>, !dbg !17
%3 = select <4 x i1> %2, <4 x i64> <i64 2, i64 4, i64 8, i64 16>, <4 x i64> zeroinitializer, !dbg !18
; Element at byte offset 20: scalar load and compare, contributes value 32.
%4 = getelementptr inbounds i8, ptr %arr, i64 20, !dbg !19
%_25 = load i32, ptr %4, align 4, !dbg !19
%_24 = icmp eq i32 %_25, 42, !dbg !20
%_22 = select i1 %_24, i64 32, i64 0, !dbg !21
; Two elements starting at byte offset 24: vector compare, lanes select 64/128.
%5 = getelementptr inbounds i8, ptr %arr, i64 24, !dbg !22
%6 = load <2 x i32>, ptr %5, align 4, !dbg !22
%7 = icmp eq <2 x i32> %6, <i32 42, i32 42>, !dbg !23
%8 = select <2 x i1> %7, <2 x i64> <i64 64, i64 128>, <2 x i64> zeroinitializer, !dbg !24
; OR-reduce the four-lane vector to a scalar and merge the offset-20 bit.
%9 = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %3), !dbg !25
%op.rdx = or disjoint i64 %9, %_22, !dbg !26
; OR-reduce the two-lane vector by shuffling lane 1 into lane 0 and extracting lane 0.
%shift = shufflevector <2 x i64> %8, <2 x i64> poison, <2 x i32> <i32 1, i32 poison>, !dbg !25
%10 = or <2 x i64> %8, %shift, !dbg !25
%op.rdx9 = extractelement <2 x i64> %10, i64 0, !dbg !25
; Merge the remaining partial results, including the bit from element 0.
%op.rdx10 = or disjoint i64 %op.rdx, %op.rdx9, !dbg !27
%op.rdx11 = or disjoint i64 %op.rdx10, %_3, !dbg !28
ret i64 %op.rdx11, !dbg !29
}