Skip to content

Movemask is only generated when N=4 #121691

@LaihoE

Description

@LaihoE

When N = 4:

/// Builds a bitmask in which bit `i` is set when `arr[i] == 42`.
///
/// This is the N = 4 variant of the reproduction: although `arr` holds eight
/// elements, only indices 0 through 3 are inspected, so only the low four
/// bits of the result can ever be set.
// `inline(never)` asks the compiler not to inline the function, so its
// generated code can be examined by itself.
#[inline(never)]
pub fn f(arr: &[u32; 8]) -> u64{
    // Accumulator for the result; its type is inferred as `u64` from the
    // return type.
    let mut mask = 0;
    // Each comparison yields a `bool`, which is widened to 0 or 1 and then
    // shifted into the bit position matching the element's index.
    mask |= ((arr[0] == 42) as u64) << 0;
    mask |= ((arr[1] == 42) as u64) << 1;
    mask |= ((arr[2] == 42) as u64) << 2;
    mask |= ((arr[3] == 42) as u64) << 3;    
    mask
}

Produces:

.LCPI0_0:
        .long   42
example::f::h52e1c98f18490021:
        vpbroadcastd    xmm0, dword ptr [rip + .LCPI0_0]
        vpcmpeqd        xmm0, xmm0, xmmword ptr [rdi]
        vmovmskps       eax, xmm0
        ret

But with any other number of elements (extending the same pattern with one `mask |= ...` line per element), you get something along the lines of the following (N=8 here):

.LCPI0_0:
        .long   42
.LCPI0_1:
        .quad   2
        .quad   4
        .quad   8
        .quad   16
example::f::h52e1c98f18490021:
        xor     ecx, ecx
        cmp     dword ptr [rdi], 42
        sete    cl
        vpbroadcastd    xmm0, dword ptr [rip + .LCPI0_0]
        vpcmpeqd        xmm0, xmm0, xmmword ptr [rdi + 4]
        vpmovzxdq       ymm0, xmm0
        vpand   ymm0, ymm0, ymmword ptr [rip + .LCPI0_1]
        xor     edx, edx
        cmp     dword ptr [rdi + 20], 42
        sete    dl
        shl     edx, 5
        xor     esi, esi
        cmp     dword ptr [rdi + 24], 42
        sete    sil
        shl     esi, 6
        xor     eax, eax
        cmp     dword ptr [rdi + 28], 42
        sete    al
        shl     eax, 7
        or      eax, esi
        vextracti128    xmm1, ymm0, 1
        vpor    xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 238
        vpor    xmm0, xmm0, xmm1
        vmovq   rsi, xmm0
        or      rax, rdx
        or      rax, rcx
        or      rax, rsi
        vzeroupper
        ret

Changing types, alignment, or flags does not seem to help. At first I thought it had to do with the shift by 0, but since it works just fine with N=4, maybe something else is going on.

LLVM IR for N=4:

define noundef range(i64 0, 16) i64 @_ZN7example1f17h52e1c98f18490021E(ptr noalias nocapture noundef readonly align 4 dereferenceable(32) %arr) unnamed_addr #0 !dbg !7 {
  %0 = load <4 x i32>, ptr %arr, align 4, !dbg !12
  %1 = icmp eq <4 x i32> %0, <i32 42, i32 42, i32 42, i32 42>, !dbg !14
  %2 = extractelement <4 x i1> %1, i64 0, !dbg !15
  %_3 = zext i1 %2 to i64, !dbg !15
  %3 = extractelement <4 x i1> %1, i64 1, !dbg !16
  %_6 = select i1 %3, i64 2, i64 0, !dbg !16
  %4 = or disjoint i64 %_6, %_3, !dbg !17
  %5 = extractelement <4 x i1> %1, i64 2, !dbg !18
  %_10 = select i1 %5, i64 4, i64 0, !dbg !18
  %6 = or disjoint i64 %4, %_10, !dbg !19
  %7 = extractelement <4 x i1> %1, i64 3, !dbg !20
  %_14 = select i1 %7, i64 8, i64 0, !dbg !20
  %8 = or disjoint i64 %6, %_14, !dbg !21
  ret i64 %8, !dbg !22
}

LLVM IR for N=8:

define noundef range(i64 0, 256) i64 @example::f::h52e1c98f18490021(ptr noalias nocapture noundef readonly align 4 dereferenceable(32) %arr) unnamed_addr #0 !dbg !7 {
start:
  %_5 = load i32, ptr %arr, align 4, !dbg !12
  %_4 = icmp eq i32 %_5, 42, !dbg !14
  %_3 = zext i1 %_4 to i64, !dbg !15
  %0 = getelementptr inbounds i8, ptr %arr, i64 4, !dbg !16
  %1 = load <4 x i32>, ptr %0, align 4, !dbg !16
  %2 = icmp eq <4 x i32> %1, <i32 42, i32 42, i32 42, i32 42>, !dbg !17
  %3 = select <4 x i1> %2, <4 x i64> <i64 2, i64 4, i64 8, i64 16>, <4 x i64> zeroinitializer, !dbg !18
  %4 = getelementptr inbounds i8, ptr %arr, i64 20, !dbg !19
  %_25 = load i32, ptr %4, align 4, !dbg !19
  %_24 = icmp eq i32 %_25, 42, !dbg !20
  %_22 = select i1 %_24, i64 32, i64 0, !dbg !21
  %5 = getelementptr inbounds i8, ptr %arr, i64 24, !dbg !22
  %6 = load <2 x i32>, ptr %5, align 4, !dbg !22
  %7 = icmp eq <2 x i32> %6, <i32 42, i32 42>, !dbg !23
  %8 = select <2 x i1> %7, <2 x i64> <i64 64, i64 128>, <2 x i64> zeroinitializer, !dbg !24
  %9 = tail call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %3), !dbg !25
  %op.rdx = or disjoint i64 %9, %_22, !dbg !26
  %shift = shufflevector <2 x i64> %8, <2 x i64> poison, <2 x i32> <i32 1, i32 poison>, !dbg !25
  %10 = or <2 x i64> %8, %shift, !dbg !25
  %op.rdx9 = extractelement <2 x i64> %10, i64 0, !dbg !25
  %op.rdx10 = or disjoint i64 %op.rdx, %op.rdx9, !dbg !27
  %op.rdx11 = or disjoint i64 %op.rdx10, %_3, !dbg !28
  ret i64 %op.rdx11, !dbg !29
}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions